From 0e6197fbed0ef52c46cf3575cb0770ade1fa877c Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 4 Mar 2018 16:03:19 +0000 Subject: [PATCH] Introduce accelerator friendly expression template rewrite. Must obtain and access lattice indexing through a view object that is safe to copy construct in copy to GPU (without copying the lattice). --- lib/lattice/Lattice_ET.h | 324 ++++++++++--------------- lib/lattice/Lattice_arith.h | 166 ++++++++----- lib/lattice/Lattice_comparison.h | 19 +- lib/lattice/Lattice_comparison_utils.h | 42 ++-- lib/lattice/Lattice_coordinate.h | 6 +- lib/lattice/Lattice_local.h | 24 +- lib/lattice/Lattice_matrix_reduction.h | 33 +-- lib/lattice/Lattice_peekpoke.h | 53 ++-- lib/lattice/Lattice_reality.h | 12 +- lib/lattice/Lattice_reduction.h | 37 +-- lib/lattice/Lattice_rng.h | 9 +- lib/lattice/Lattice_trace.h | 21 +- lib/lattice/Lattice_transfer.h | 93 ++++--- lib/lattice/Lattice_transpose.h | 16 +- lib/lattice/Lattice_unary.h | 38 +-- lib/lattice/Lattice_where.h | 90 ------- 16 files changed, 470 insertions(+), 513 deletions(-) delete mode 100644 lib/lattice/Lattice_where.h diff --git a/lib/lattice/Lattice_ET.h b/lib/lattice/Lattice_ET.h index d5d8ec8d..2cdc6b37 100644 --- a/lib/lattice/Lattice_ET.h +++ b/lib/lattice/Lattice_ET.h @@ -68,149 +68,139 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru return ret; } -//////////////////////////////////////////// -// recursive evaluation of expressions; Could -// switch to generic approach with variadics, a la -// Antonin's Lat Sim but the repack to variadic with popped -// from tuple is hideous; C++14 introduces std::make_index_sequence for this -//////////////////////////////////////////// - -// leaf eval of lattice ; should enable if protect using traits - -template -using is_lattice = std::is_base_of; - -template -using is_lattice_expr = std::is_base_of; - -template using is_lattice_expr = std::is_base_of; - +///////////////////////////////////////////////////// 
//Specialization of getVectorType for lattices +///////////////////////////////////////////////////// template struct getVectorType >{ typedef typename Lattice::vector_object type; }; - -template -inline sobj eval(const unsigned int ss, const sobj &arg) + +//////////////////////////////////////////// +//-- recursive evaluation of expressions; -- +// handle leaves of syntax tree +/////////////////////////////////////////////////// +template accelerator_inline +sobj eval(const unsigned int ss, const sobj &arg) { return arg; } -template -inline const lobj &eval(const unsigned int ss, const Lattice &arg) { + +template accelerator_inline +const lobj & eval(const unsigned int ss, const LatticeView &arg) +{ return arg[ss]; } - -// handle nodes in syntax tree -template -auto inline eval( - const unsigned int ss, - const LatticeUnaryExpression &expr) // eval one operand - -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) { - return expr.first.func(eval(ss, std::get<0>(expr.second))); +template accelerator_inline +const lobj & eval(const unsigned int ss, const Lattice &arg) +{ + auto view = arg.View(); + return view[ss]; } -template -auto inline eval( - const unsigned int ss, - const LatticeBinaryExpression &expr) // eval two operands - -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)), - eval(ss, std::get<1>(expr.second)))) { - return expr.first.func(eval(ss, std::get<0>(expr.second)), - eval(ss, std::get<1>(expr.second))); +/////////////////////////////////////////////////// +// handle nodes in syntax tree- eval one operand +/////////////////////////////////////////////////// +template accelerator_inline +auto eval(const unsigned int ss, const LatticeUnaryExpression &expr) + -> decltype(expr.op.func( eval(ss, expr.arg1))) +{ + return expr.op.func( eval(ss, expr.arg1) ); } - -template -auto inline eval(const unsigned int ss, - const LatticeTrinaryExpression - &expr) // eval three operands - -> decltype(expr.first.func(eval(ss, 
std::get<0>(expr.second)), - eval(ss, std::get<1>(expr.second)), - eval(ss, std::get<2>(expr.second)))) { - return expr.first.func(eval(ss, std::get<0>(expr.second)), - eval(ss, std::get<1>(expr.second)), - eval(ss, std::get<2>(expr.second))); +/////////////////////// +// eval two operands +/////////////////////// +template accelerator_inline +auto eval(const unsigned int ss, const LatticeBinaryExpression &expr) + -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2))) +{ + return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) ); +} +/////////////////////// +// eval three operands +/////////////////////// +template accelerator_inline +auto eval(const unsigned int ss, const LatticeTrinaryExpression &expr) + -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3))) +{ + return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)); } ////////////////////////////////////////////////////////////////////////// // Obtain the grid from an expression, ensuring conformable. This must follow a -// tree recursion +// tree recursion; must retain grid pointer in the LatticeView class which sucks +// Use a different method, and make it void *. +// Perhaps a conformable method. 
////////////////////////////////////////////////////////////////////////// -template ::value, T1>::type * = nullptr> +template ::value, T1>::type * = nullptr> inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf { - if (grid) { - conformable(grid, lat.Grid()); - } - grid = lat.Grid(); + lat.Conformable(grid); } -template ::value, T1>::type * = nullptr> -inline void GridFromExpression(GridBase *&grid, - const T1 ¬lat) // non-lattice leaf + +template ::value, T1>::type * = nullptr> +accelerator_inline +void GridFromExpression(GridBase *&grid,const T1 ¬lat) // non-lattice leaf {} + template -inline void GridFromExpression(GridBase *&grid, - const LatticeUnaryExpression &expr) { - GridFromExpression(grid, std::get<0>(expr.second)); // recurse +accelerator_inline +void GridFromExpression(GridBase *&grid,const LatticeUnaryExpression &expr) +{ + GridFromExpression(grid, expr.arg1); // recurse } template -inline void GridFromExpression( - GridBase *&grid, const LatticeBinaryExpression &expr) { - GridFromExpression(grid, std::get<0>(expr.second)); // recurse - GridFromExpression(grid, std::get<1>(expr.second)); +accelerator_inline +void GridFromExpression(GridBase *&grid, const LatticeBinaryExpression &expr) +{ + GridFromExpression(grid, expr.arg1); // recurse + GridFromExpression(grid, expr.arg2); } template -inline void GridFromExpression( - GridBase *&grid, const LatticeTrinaryExpression &expr) { - GridFromExpression(grid, std::get<0>(expr.second)); // recurse - GridFromExpression(grid, std::get<1>(expr.second)); - GridFromExpression(grid, std::get<2>(expr.second)); +accelerator_inline +void GridFromExpression(GridBase *&grid, const LatticeTrinaryExpression &expr) +{ + GridFromExpression(grid, expr.arg1); // recurse + GridFromExpression(grid, expr.arg2); // recurse + GridFromExpression(grid, expr.arg3); // recurse } ////////////////////////////////////////////////////////////////////////// // Obtain the CB from an expression, ensuring 
conformable. This must follow a // tree recursion ////////////////////////////////////////////////////////////////////////// -template ::value, T1>::type * = nullptr> +template ::value, T1>::type * = nullptr> inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf { if ((cb == Odd) || (cb == Even)) { assert(cb == lat.Checkerboard()); } cb = lat.Checkerboard(); - // std::cout<::value, T1>::type * = nullptr> +template ::value, T1>::type * = nullptr> inline void CBFromExpression(int &cb, const T1 ¬lat) // non-lattice leaf { - // std::cout< -inline void CBFromExpression(int &cb, - const LatticeUnaryExpression &expr) { - CBFromExpression(cb, std::get<0>(expr.second)); // recurse - // std::cout< -inline void CBFromExpression(int &cb, - const LatticeBinaryExpression &expr) { - CBFromExpression(cb, std::get<0>(expr.second)); // recurse - CBFromExpression(cb, std::get<1>(expr.second)); - // std::cout< inline +void CBFromExpression(int &cb,const LatticeUnaryExpression &expr) +{ + CBFromExpression(cb, expr.arg1); // recurse AST +} + +template inline +void CBFromExpression(int &cb,const LatticeBinaryExpression &expr) +{ + CBFromExpression(cb, expr.arg1); // recurse AST + CBFromExpression(cb, expr.arg2); // recurse AST } template -inline void CBFromExpression( - int &cb, const LatticeTrinaryExpression &expr) { - CBFromExpression(cb, std::get<0>(expr.second)); // recurse - CBFromExpression(cb, std::get<1>(expr.second)); - CBFromExpression(cb, std::get<2>(expr.second)); - // std::cout< &expr) +{ + CBFromExpression(cb, expr.arg1); // recurse AST + CBFromExpression(cb, expr.arg2); // recurse AST + CBFromExpression(cb, expr.arg3); // recurse AST } //////////////////////////////////////////// @@ -253,15 +243,16 @@ GridUnopClass(UnaryExp, exp(a)); template \ struct name { \ static auto inline func(const left &lhs, const right &rhs) \ - -> decltype(combination) const { \ + -> decltype(combination) const \ + { \ return combination; \ } \ - } + }; + GridBinOpClass(BinaryAdd, 
lhs + rhs); GridBinOpClass(BinarySub, lhs - rhs); GridBinOpClass(BinaryMul, lhs *rhs); GridBinOpClass(BinaryDiv, lhs /rhs); - GridBinOpClass(BinaryAnd, lhs &rhs); GridBinOpClass(BinaryOr, lhs | rhs); GridBinOpClass(BinaryAndAnd, lhs &&rhs); @@ -273,68 +264,50 @@ GridBinOpClass(BinaryOrOr, lhs || rhs); #define GridTrinOpClass(name, combination) \ template \ struct name { \ - static auto inline func(const predicate &pred, const left &lhs, \ - const right &rhs) -> decltype(combination) const { \ + static auto inline func(const predicate &pred, const left &lhs, const right &rhs) \ + -> decltype(combination) const \ + { \ return combination; \ } \ - } + }; -GridTrinOpClass( - TrinaryWhere, - (predicatedWhere::type, - typename std::remove_reference::type>(pred, lhs, - rhs))); +GridTrinOpClass(TrinaryWhere, + (predicatedWhere::type, + typename std::remove_reference::type>(pred, lhs,rhs))); //////////////////////////////////////////// // Operator syntactical glue //////////////////////////////////////////// -#define GRID_UNOP(name) name -#define GRID_BINOP(name) name -#define GRID_TRINOP(name) \ - name +#define GRID_UNOP(name) name +#define GRID_BINOP(name) name +#define GRID_TRINOP(name) name #define GRID_DEF_UNOP(op, name) \ - template ::value || \ - is_lattice_expr::value, \ - T1>::type * = nullptr> \ - inline auto op(const T1 &arg) \ - ->decltype(LatticeUnaryExpression( \ - std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \ - return LatticeUnaryExpression( \ - std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg))); \ + template ::value||is_lattice_expr::value,T1>::type * = nullptr> \ + inline auto op(const T1 &arg) ->decltype(LatticeUnaryExpression(GRID_UNOP(name)(), arg)) \ + { \ + return LatticeUnaryExpression(GRID_UNOP(name)(), arg); \ } #define GRID_BINOP_LEFT(op, name) \ template ::value || \ - is_lattice_expr::value, \ - T1>::type * = nullptr> \ + typename std::enable_if::value||is_lattice_expr::value,T1>::type * = nullptr> \ inline auto 
op(const T1 &lhs, const T2 &rhs) \ - ->decltype( \ - LatticeBinaryExpression( \ - std::make_pair(GRID_BINOP(name)(), \ - std::forward_as_tuple(lhs, rhs)))) { \ - return LatticeBinaryExpression( \ - std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \ + ->decltype(LatticeBinaryExpression(GRID_BINOP(name)(),lhs,rhs)) \ + { \ + return LatticeBinaryExpression(GRID_BINOP(name)(),lhs,rhs);\ } #define GRID_BINOP_RIGHT(op, name) \ template ::value && \ - !is_lattice_expr::value, \ - T1>::type * = nullptr, \ - typename std::enable_if::value || \ - is_lattice_expr::value, \ - T2>::type * = nullptr> \ + typename std::enable_if::value&&!is_lattice_expr::value,T1>::type * = nullptr, \ + typename std::enable_if< is_lattice::value|| is_lattice_expr::value,T2>::type * = nullptr> \ inline auto op(const T1 &lhs, const T2 &rhs) \ - ->decltype( \ - LatticeBinaryExpression( \ - std::make_pair(GRID_BINOP(name)(), \ - std::forward_as_tuple(lhs, rhs)))) { \ - return LatticeBinaryExpression( \ - std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \ + ->decltype(LatticeBinaryExpression(GRID_BINOP(name)(),lhs, rhs)) \ + { \ + return LatticeBinaryExpression(GRID_BINOP(name)(),lhs, rhs); \ } #define GRID_DEF_BINOP(op, name) \ @@ -344,18 +317,14 @@ GridTrinOpClass( #define GRID_DEF_TRINOP(op, name) \ template \ inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs) \ - ->decltype( \ - LatticeTrinaryExpression(std::make_pair( \ - GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) { \ - return LatticeTrinaryExpression(std::make_pair( \ - GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs))); \ + ->decltype(LatticeTrinaryExpression(GRID_TRINOP(name)(),pred, lhs, rhs)) \ + { \ + return LatticeTrinaryExpression(GRID_TRINOP(name)(),pred, lhs, rhs); \ } + //////////////////////// // Operator definitions //////////////////////// - GRID_DEF_UNOP(operator-, UnarySub); GRID_DEF_UNOP(Not, UnaryNot); GRID_DEF_UNOP(operator!, UnaryNot); @@ -399,29 
+368,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere); ///////////////////////////////////////////////////////////// template auto closure(const LatticeUnaryExpression &expr) - -> Lattice(expr.second))))> { - Lattice(expr.second))))> ret( - expr); + -> Lattice +{ + Lattice ret(expr); return ret; } template auto closure(const LatticeBinaryExpression &expr) - -> Lattice(expr.second)), - eval(0, std::get<1>(expr.second))))> { - Lattice(expr.second)), - eval(0, std::get<1>(expr.second))))> - ret(expr); + -> Lattice +{ + Lattice ret(expr); return ret; } template auto closure(const LatticeTrinaryExpression &expr) - -> Lattice(expr.second)), - eval(0, std::get<1>(expr.second)), - eval(0, std::get<2>(expr.second))))> { - Lattice(expr.second)), - eval(0, std::get<1>(expr.second)), - eval(0, std::get<2>(expr.second))))> - ret(expr); + -> Lattice +{ + Lattice ret(expr); return ret; } @@ -432,34 +399,7 @@ auto closure(const LatticeTrinaryExpression &expr) #undef GRID_DEF_UNOP #undef GRID_DEF_BINOP #undef GRID_DEF_TRINOP + NAMESPACE_END(Grid); -#if 0 -using namespace Grid; - -int main(int argc,char **argv){ - - Lattice v1(16); - Lattice v2(16); - Lattice v3(16); - - BinaryAdd tmp; - LatticeBinaryExpression,Lattice &,Lattice &> - expr(std::make_pair(tmp, - std::forward_as_tuple(v1,v2))); - tmp.func(eval(0,v1),eval(0,v2)); - - auto var = v1+v2; - std::cout< &v1,Lattice &v2,Lattice &v3) -{ - v3=v1+v2+v1*v2; -} -#endif - #endif diff --git a/lib/lattice/Lattice_arith.h b/lib/lattice/Lattice_arith.h index 7749aa02..117e7e6d 100644 --- a/lib/lattice/Lattice_arith.h +++ b/lib/lattice/Lattice_arith.h @@ -36,17 +36,20 @@ NAMESPACE_BEGIN(Grid); template inline void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); + auto rhs_v = rhs.View(); conformable(ret,rhs); conformable(lhs,rhs); #ifdef STREAMING_STORES - accelerator_loop(ss,lhs,{ + accelerator_loop(ss,lhs_v,{ obj1 tmp; - 
mult(&tmp,&lhs[ss],&rhs[ss]); - vstream(ret[ss],tmp); + mult(&tmp,&lhs_v[ss],&rhs_v[ss]); + vstream(ret_v[ss],tmp); }); #else - accelerator_loop(ss,lhs,{ - mult(&ret[ss],&lhs[ss],&rhs[ss]); + accelerator_loop(ss,lhs_v,{ + mult(&ret_v[ss],&lhs_v[ss],&rhs_v[ss]); }); #endif } @@ -56,15 +59,18 @@ void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); + auto rhs_v = rhs.View(); #ifdef STREAMING_STORES - accelerator_loop(ss,lhs,{ + accelerator_loop(ss,lhs_v,{ obj1 tmp; - mac(&tmp,&lhs[ss],&rhs[ss]); - vstream(ret[ss],tmp); + mac(&tmp,&lhs_v[ss],&rhs_v[ss]); + vstream(ret_v[ss],tmp); }); #else - accelerator_loop(ss,lhs,{ - mac(&ret[ss],&lhs[ss],&rhs[ss]); + accelerator_loop(ss,lhs_v,{ + mac(&ret_v[ss],&lhs_v[ss],&rhs_v[ss]); }); #endif } @@ -74,15 +80,18 @@ void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); + auto rhs_v = rhs.View(); #ifdef STREAMING_STORES - accelerator_loop(ss,lhs,{ + accelerator_loop(ss,lhs_v,{ obj1 tmp; - sub(&tmp,&lhs[ss],&rhs[ss]); - vstream(ret[ss],tmp); + sub(&tmp,&lhs_v[ss],&rhs_v[ss]); + vstream(ret_v[ss],tmp); }); #else - accelerator_loop(ss,lhs,{ - sub(&ret[ss],&lhs[ss],&rhs[ss]); + accelerator_loop(ss,lhs_v,{ + sub(&ret_v[ss],&lhs_v[ss],&rhs_v[ss]); }); #endif } @@ -91,15 +100,18 @@ void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); + auto rhs_v = rhs.View(); #ifdef STREAMING_STORES - accelerator_loop(ss,lhs,{ + accelerator_loop(ss,lhs_v,{ obj1 tmp; - add(&tmp,&lhs[ss],&rhs[ss]); - vstream(ret[ss],tmp); + add(&tmp,&lhs_v[ss],&rhs_v[ss]); + vstream(ret_v[ss],tmp); }); #else - accelerator_loop(ss,lhs,{ - 
add(&ret[ss],&lhs[ss],&rhs[ss]); + accelerator_loop(ss,lhs_v,{ + add(&ret_v[ss],&lhs_v[ss],&rhs_v[ss]); }); #endif } @@ -111,10 +123,12 @@ template inline void mult(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - accelerator_loop(ss,lhs,{ + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); + accelerator_loop(ss,lhs_v,{ obj1 tmp; - mult(&tmp,&lhs[ss],&rhs); - vstream(ret[ss],tmp); + mult(&tmp,&lhs_v[ss],&rhs); + vstream(ret_v[ss],tmp); }); } @@ -122,10 +136,12 @@ template inline void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - accelerator_loop(ss,lhs,{ + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); + accelerator_loop(ss,lhs_v,{ obj1 tmp; - mac(&tmp,&lhs[ss],&rhs); - vstream(ret[ss],tmp); + mac(&tmp,&lhs_v[ss],&rhs); + vstream(ret_v[ss],tmp); }); } @@ -133,15 +149,17 @@ template inline void sub(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); #ifdef STREAMING_STORES - accelerator_loop(ss,lhs,{ + accelerator_loop(ss,lhs_v,{ obj1 tmp; - sub(&tmp,&lhs[ss],&rhs); - vstream(ret[ss],tmp); + sub(&tmp,&lhs_v[ss],&rhs); + vstream(ret_v[ss],tmp); }); #else - accelerator_loop(ss,lhs,{ - sub(&ret[ss],&lhs[ss],&rhs); + accelerator_loop(ss,lhs_v,{ + sub(&ret_v[ss],&lhs_v[ss],&rhs); }); #endif } @@ -149,15 +167,17 @@ template inline void add(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); #ifdef STREAMING_STORES - accelerator_loop(ss,lhs,{ + accelerator_loop(ss,lhs_v,{ obj1 tmp; - add(&tmp,&lhs[ss],&rhs); - vstream(ret[ss],tmp); + add(&tmp,&lhs_v[ss],&rhs); + vstream(ret_v[ss],tmp); }); #else - accelerator_loop(ss,lhs,{ - add(&ret[ss],&lhs[ss],&rhs); + accelerator_loop(ss,lhs_v,{ + 
add(&ret_v[ss],&lhs_v[ss],&rhs); }); #endif } @@ -169,15 +189,17 @@ template inline void mult(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); + auto ret_v = ret.View(); + auto rhs_v = rhs.View(); #ifdef STREAMING_STORES - accelerator_loop(ss,rhs,{ + accelerator_loop(ss,rhs_v,{ obj1 tmp; - mult(&tmp,&lhs,&rhs[ss]); - vstream(ret[ss],tmp); + mult(&tmp,&lhs,&rhs_v[ss]); + vstream(ret_v[ss],tmp); }); #else - accelerator_loop(ss,rhs,{ - mult(&ret[ss],&lhs,&rhs[ss]); + accelerator_loop(ss,rhs_v,{ + mult(&ret_v[ss],&lhs,&rhs_v[ss]); }); #endif } @@ -186,15 +208,17 @@ template inline void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); + auto ret_v = ret.View(); + auto rhs_v = rhs.View(); #ifdef STREAMING_STORES - accelerator_loop(ss,rhs,{ + accelerator_loop(ss,rhs_v,{ obj1 tmp; - mac(&tmp,&lhs,&rhs[ss]); - vstream(ret[ss],tmp); + mac(&tmp,&lhs,&rhs_v[ss]); + vstream(ret_v[ss],tmp); }); #else - accelerator_loop(ss,rhs,{ - mac(&ret[ss],&lhs,&rhs[ss]); + accelerator_loop(ss,rhs_v,{ + mac(&ret_v[ss],&lhs,&rhs_v[ss]); }); #endif } @@ -203,15 +227,17 @@ template inline void sub(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); + auto ret_v = ret.View(); + auto rhs_v = rhs.View(); #ifdef STREAMING_STORES - accelerator_loop(ss,rhs,{ + accelerator_loop(ss,rhs_v,{ obj1 tmp; - sub(&tmp,&lhs,&rhs[ss]); - vstream(ret[ss],tmp); + sub(&tmp,&lhs,&rhs_v[ss]); + vstream(ret_v[ss],tmp); }); #else - accelerator_loop(ss,rhs,{ - sub(&ret[ss],&lhs,&rhs[ss]); + accelerator_loop(ss,rhs_v,{ + sub(&ret_v[ss],&lhs,&rhs_v[ss]); }); #endif } @@ -219,15 +245,17 @@ template inline void add(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); + auto ret_v = ret.View(); + auto rhs_v = rhs.View(); #ifdef STREAMING_STORES - accelerator_loop(ss,rhs,{ + 
accelerator_loop(ss,rhs_v,{ obj1 tmp; - add(&tmp,&lhs,&rhs[ss]); - vstream(ret[ss],tmp); + add(&tmp,&lhs,&rhs_v[ss]); + vstream(ret_v[ss],tmp); }); #else - accelerator_loop(ss,rhs,{ - add(&ret[ss],&lhs,&rhs[ss]); + accelerator_loop(ss,rhs_v,{ + add(&ret_v[ss],&lhs,&rhs_v[ss]); }); #endif } @@ -237,14 +265,17 @@ void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice & ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); + auto ret_v = ret.View(); + auto x_v = x.View(); + auto y_v = y.View(); #ifdef STREAMING_STORES - accelerator_loop(ss,x,{ - vobj tmp = a*x[ss]+y[ss]; - vstream(ret[ss],tmp); + accelerator_loop(ss,x_v,{ + vobj tmp = a*x_v[ss]+y_v[ss]; + vstream(ret_v[ss],tmp); }); #else - accelerator_loop(ss,x,{ - ret[ss]=a*x[ss]+y[ss]; + accelerator_loop(ss,x_v,{ + ret_v[ss]=a*x_v[ss]+y_v[ss]; }); #endif } @@ -253,14 +284,17 @@ void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); + auto ret_v = ret.View(); + auto x_v = x.View(); + auto y_v = y.View(); #ifdef STREAMING_STORES - accelerator_loop(ss,x,{ - vobj tmp = a*x[ss]+b*y[ss]; - vstream(ret[ss],tmp); + accelerator_loop(ss,x_v,{ + vobj tmp = a*x_v[ss]+b*y_v[ss]; + vstream(ret_v[ss],tmp); }); #else - accelerator_loop(ss,x,{ - ret[ss]=a*x[ss]+b*y[ss]; + accelerator_loop(ss,x_v,{ + ret_v[ss]=a*x_v[ss]+b*y_v[ss]; }); #endif } diff --git a/lib/lattice/Lattice_comparison.h b/lib/lattice/Lattice_comparison.h index 724fcbca..3a54c114 100644 --- a/lib/lattice/Lattice_comparison.h +++ b/lib/lattice/Lattice_comparison.h @@ -47,8 +47,11 @@ template inline Lattice LLComparison(vfunctor op,const Lattice &lhs,const Lattice &rhs) { Lattice ret(rhs.Grid()); - accelerator_loop( ss, rhs, { - ret[ss]=op(lhs[ss],rhs[ss]); + auto lhs_v = lhs.View(); + auto rhs_v = rhs.View(); + auto ret_v = ret.View(); + accelerator_loop( ss, rhs_v, { + ret_v[ss]=op(lhs_v[ss],rhs_v[ss]); }); return ret; } @@ -59,8 +62,10 @@ 
template inline Lattice LSComparison(vfunctor op,const Lattice &lhs,const robj &rhs) { Lattice ret(lhs.Grid()); - accelerator_loop( ss, lhs, { - ret[ss]=op(lhs[ss],rhs); + auto lhs_v = lhs.View(); + auto ret_v = ret.View(); + accelerator_loop( ss, lhs_v, { + ret_v[ss]=op(lhs_v[ss],rhs); }); return ret; } @@ -71,8 +76,10 @@ template inline Lattice SLComparison(vfunctor op,const lobj &lhs,const Lattice &rhs) { Lattice ret(rhs.Grid()); - accelerator_loop( ss, rhs, { - ret[ss]=op(lhs[ss],rhs); + auto rhs_v = rhs.View(); + auto ret_v = ret.View(); + accelerator_loop( ss, rhs_v, { + ret_v[ss]=op(lhs,rhs_v[ss]); }); return ret; } diff --git a/lib/lattice/Lattice_comparison_utils.h b/lib/lattice/Lattice_comparison_utils.h index 8359e217..408961ec 100644 --- a/lib/lattice/Lattice_comparison_utils.h +++ b/lib/lattice/Lattice_comparison_utils.h @@ -44,42 +44,42 @@ NAMESPACE_BEGIN(Grid); // template class veq { public: - vInteger operator()(const lobj &lhs, const robj &rhs) + accelerator vInteger operator()(const lobj &lhs, const robj &rhs) { return (lhs) == (rhs); } }; template class vne { public: - vInteger operator()(const lobj &lhs, const robj &rhs) + accelerator vInteger operator()(const lobj &lhs, const robj &rhs) { return (lhs) != (rhs); } }; template class vlt { public: - vInteger operator()(const lobj &lhs, const robj &rhs) + accelerator vInteger operator()(const lobj &lhs, const robj &rhs) { return (lhs) < (rhs); } }; template class vle { public: - vInteger operator()(const lobj &lhs, const robj &rhs) + accelerator vInteger operator()(const lobj &lhs, const robj &rhs) { return (lhs) <= (rhs); } }; template class vgt { public: - vInteger operator()(const lobj &lhs, const robj &rhs) + accelerator vInteger operator()(const lobj &lhs, const robj &rhs) { return (lhs) > (rhs); } }; template class vge { public: - vInteger operator()(const lobj &lhs, const robj &rhs) + accelerator vInteger operator()(const lobj &lhs, const robj &rhs) { return (lhs) >= (rhs); } @@ -88,42 
+88,42 @@ public: // Generic list of functors template class seq { public: - Integer operator()(const lobj &lhs, const robj &rhs) + accelerator Integer operator()(const lobj &lhs, const robj &rhs) { return (lhs) == (rhs); } }; template class sne { public: - Integer operator()(const lobj &lhs, const robj &rhs) + accelerator Integer operator()(const lobj &lhs, const robj &rhs) { return (lhs) != (rhs); } }; template class slt { public: - Integer operator()(const lobj &lhs, const robj &rhs) + accelerator Integer operator()(const lobj &lhs, const robj &rhs) { return (lhs) < (rhs); } }; template class sle { public: - Integer operator()(const lobj &lhs, const robj &rhs) + accelerator Integer operator()(const lobj &lhs, const robj &rhs) { return (lhs) <= (rhs); } }; template class sgt { public: - Integer operator()(const lobj &lhs, const robj &rhs) + accelerator Integer operator()(const lobj &lhs, const robj &rhs) { return (lhs) > (rhs); } }; template class sge { public: - Integer operator()(const lobj &lhs, const robj &rhs) + accelerator Integer operator()(const lobj &lhs, const robj &rhs) { return (lhs) >= (rhs); } @@ -133,7 +133,7 @@ public: // Integer and real get extra relational functions. 
////////////////////////////////////////////////////////////////////////////////////////////////////// template = 0> -inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs) +accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs) { typedef typename vsimd::scalar_type scalar; ExtractBuffer vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation @@ -150,7 +150,7 @@ inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs) } template = 0> -inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs) +accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs) { typedef typename vsimd::scalar_type scalar; ExtractBuffer vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation @@ -165,7 +165,7 @@ inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd: } template = 0> -inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs) +accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs) { typedef typename vsimd::scalar_type scalar; ExtractBuffer vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation @@ -181,35 +181,35 @@ inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, #define DECLARE_RELATIONAL(op,functor) \ template = 0> \ - inline vInteger operator op (const vsimd & lhs, const vsimd & rhs) \ + accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs) \ { \ typedef typename vsimd::scalar_type scalar; \ return Comparison(functor(),lhs,rhs); \ } \ template = 0> \ - inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \ + accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \ { \ typedef 
typename vsimd::scalar_type scalar; \ return Comparison(functor(),lhs,rhs); \ } \ template = 0> \ - inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \ + accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \ { \ typedef typename vsimd::scalar_type scalar; \ return Comparison(functor(),lhs,rhs); \ } \ template \ - inline vInteger operator op(const iScalar &lhs,const iScalar &rhs) \ + accelerator_inline vInteger operator op(const iScalar &lhs,const iScalar &rhs) \ { \ return lhs._internal op rhs._internal; \ } \ template \ - inline vInteger operator op(const iScalar &lhs,const typename vsimd::scalar_type &rhs) \ + accelerator_inline vInteger operator op(const iScalar &lhs,const typename vsimd::scalar_type &rhs) \ { \ return lhs._internal op rhs; \ } \ template \ - inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar &rhs) \ + accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar &rhs) \ { \ return lhs op rhs._internal; \ } diff --git a/lib/lattice/Lattice_coordinate.h b/lib/lattice/Lattice_coordinate.h index 46d77b09..c83e91a2 100644 --- a/lib/lattice/Lattice_coordinate.h +++ b/lib/lattice/Lattice_coordinate.h @@ -42,20 +42,22 @@ template inline void LatticeCoordinate(Lattice &l,int mu) ExtractBuffer mergebuf(Nsimd); vector_type vI; + auto l_v = l.View(); for(int o=0;ooSites();o++){ for(int i=0;iiSites();i++){ grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); mergebuf[i]=(Integer)gcoor[mu]; } merge(vI,mergebuf); - l[o]=vI; + l_v[o]=vI; } }; // LatticeCoordinate(); // FIXME for debug; deprecate this; made obscelete by template void lex_sites(Lattice &l){ - Real *v_ptr = (Real *)&l[0]; + auto l_v = l.View(); + Real *v_ptr = (Real *)&l_v[0]; size_t o_len = l.Grid()->oSites(); size_t v_len = sizeof(vobj)/sizeof(vRealF); size_t vec_len = vRealF::Nsimd(); diff --git a/lib/lattice/Lattice_local.h 
b/lib/lattice/Lattice_local.h index 1616f760..a270028a 100644 --- a/lib/lattice/Lattice_local.h +++ b/lib/lattice/Lattice_local.h @@ -43,8 +43,10 @@ template inline auto localNorm2 (const Lattice &rhs)-> Lattice { Lattice ret(rhs.Grid()); - accelerator_loop(ss,rhs,{ - ret[ss]=innerProduct(rhs[ss],rhs[ss]); + auto rhs_v = rhs.View(); + auto ret_v = ret.View(); + accelerator_loop(ss,rhs_v,{ + ret_v[ss]=innerProduct(rhs_v[ss],rhs_v[ss]); }); return ret; } @@ -54,8 +56,11 @@ template inline auto localInnerProduct (const Lattice &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs.Grid()); - accelerator_loop(ss,rhs,{ - ret[ss]=innerProduct(lhs[ss],rhs[ss]); + auto lhs_v = lhs.View(); + auto rhs_v = rhs.View(); + auto ret_v = ret.View(); + accelerator_loop(ss,rhs_v,{ + ret_v[ss]=innerProduct(lhs_v[ss],rhs_v[ss]); }); return ret; } @@ -63,11 +68,14 @@ inline auto localInnerProduct (const Lattice &lhs,const Lattice &rhs // outerProduct Scalar x Scalar -> Scalar // Vector x Vector -> Matrix template -inline auto outerProduct (const Lattice &lhs,const Lattice &rhs) -> Lattice +inline auto outerProduct (const Lattice &lhs,const Lattice &rhs) -> Lattice { - Lattice ret(rhs.Grid()); - accelerator_loop(ss,rhs,{ - ret[ss]=outerProduct(lhs[ss],rhs[ss]); + Lattice ret(rhs.Grid()); + auto lhs_v = lhs.View(); + auto rhs_v = rhs.View(); + auto ret_v = ret.View(); + accelerator_loop(ss,rhs_v,{ + ret_v[ss]=outerProduct(lhs_v[ss],rhs_v[ss]); }); return ret; } diff --git a/lib/lattice/Lattice_matrix_reduction.h b/lib/lattice/Lattice_matrix_reduction.h index 957a9bce..ccbc13b2 100644 --- a/lib/lattice/Lattice_matrix_reduction.h +++ b/lib/lattice/Lattice_matrix_reduction.h @@ -51,6 +51,9 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice int block =FullGrid->_slice_block [Orthog]; int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; + auto X_v = X.View(); + auto Y_v = Y.View(); + auto R_v = R.View(); thread_region { std::vector 
s_x(Nblock); @@ -60,16 +63,16 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice int o = n*stride + b; for(int i=0;i &R,Eigen::MatrixXcd &aa,const Lattice< int Nblock = X.Grid()->GlobalDimensions()[Orthog]; GridBase *FullGrid = X.Grid(); - // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - // Lattice Xslice(SliceGrid); - // Lattice Rslice(SliceGrid); - assert( FullGrid->_simd_layout[Orthog]==1); - // int nh = FullGrid->_ndimension; - // int nl = SliceGrid->_ndimension; - // int nl=1; //FIXME package in a convenient iterator //Should loop over a plane orthogonal to direction "Orthog" @@ -100,16 +96,20 @@ static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice< int block =FullGrid->_slice_block [Orthog]; int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; + + auto X_v = X.View(); + auto R_v = R.View(); + thread_region { std::vector s_x(Nblock); - + thread_loop_collapse2( (int n=0;n &R,Eigen::MatrixXcd &aa,const Lattice< for(int j=1;j int ostride=FullGrid->_ostride[Orthog]; typedef typename vobj::vector_typeD vector_typeD; - + auto lhs_v = lhs.View(); + auto rhs_v = rhs.View(); thread_region { std::vector Left(Nblock); std::vector Right(Nblock); @@ -168,8 +169,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice int o = n*stride + b; for(int i=0;i -auto PeekIndex(const Lattice &lhs,int i) -> Lattice(lhs[0],i))> +auto PeekIndex(const Lattice &lhs,int i) -> Lattice(vobj(),i))> { - Lattice(lhs[0],i))> ret(lhs.Grid()); + Lattice(vobj(),i))> ret(lhs.Grid()); ret.Checkerboard()=lhs.Checkerboard(); - cpu_loop( ss, lhs, { - ret[ss] = peekIndex(lhs[ss],i); + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); + cpu_loop( ss, lhs_v, { + ret_v[ss] = peekIndex(lhs_v[ss],i); }); return ret; }; template -auto PeekIndex(const Lattice &lhs,int i,int j) -> Lattice(lhs[0],i,j))> +auto PeekIndex(const Lattice &lhs,int i,int j) -> Lattice(vobj(),i,j))> { - 
Lattice(lhs[0],i,j))> ret(lhs.Grid()); + Lattice(vobj(),i,j))> ret(lhs.Grid()); ret.Checkerboard()=lhs.Checkerboard(); - cpu_loop( ss, lhs, { - ret[ss] = peekIndex(lhs[ss],i,j); + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); + cpu_loop( ss, lhs_v, { + ret_v[ss] = peekIndex(lhs_v[ss],i,j); }); return ret; }; @@ -66,17 +70,21 @@ auto PeekIndex(const Lattice &lhs,int i,int j) -> Lattice -void PokeIndex(Lattice &lhs,const Lattice(lhs[0],0))> & rhs,int i) +void PokeIndex(Lattice &lhs,const Lattice(vobj(),0))> & rhs,int i) { - cpu_loop( ss, lhs, { - pokeIndex(lhs[ss],rhs[ss],i); + auto rhs_v = rhs.View(); + auto lhs_v = lhs.View(); + cpu_loop( ss, lhs_v, { + pokeIndex(lhs_v[ss],rhs_v[ss],i); }); } template -void PokeIndex(Lattice &lhs,const Lattice(lhs[0],0,0))> & rhs,int i,int j) +void PokeIndex(Lattice &lhs,const Lattice(vobj(),0,0))> & rhs,int i,int j) { - cpu_loop( ss, lhs, { - pokeIndex(lhs[ss],rhs[ss],i,j); + auto rhs_v = rhs.View(); + auto lhs_v = lhs.View(); + cpu_loop( ss, lhs_v, { + pokeIndex(lhs_v[ss],rhs_v[ss],i,j); }); } @@ -103,10 +111,11 @@ void pokeSite(const sobj &s,Lattice &l,const Coordinate &site){ // extract-modify-merge cycle is easiest way and this is not perf critical ExtractBuffer buf(Nsimd); + auto l_v = l.View(); if ( rank == grid->ThisRank() ) { - extract(l[odx],buf); + extract(l_v[odx],buf); buf[idx] = s; - merge(l[odx],buf); + merge(l_v[odx],buf); } return; @@ -132,7 +141,8 @@ void peekSite(sobj &s,const Lattice &l,const Coordinate &site){ grid->GlobalCoorToRankIndex(rank,odx,idx,site); ExtractBuffer buf(Nsimd); - extract(l[odx],buf); + auto l_v = l.View(); + extract(l_v[odx],buf); s = buf[idx]; @@ -162,8 +172,9 @@ void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site){ int odx,idx; idx= grid->iIndex(site); odx= grid->oIndex(site); - - scalar_type * vp = (scalar_type *)&l[odx]; + + auto l_v = l.View(); + scalar_type * vp = (scalar_type *)&l_v[odx]; scalar_type * pt = (scalar_type *)&s; for(int w=0;w &l,Coordinate &site){ idx= 
grid->iIndex(site); odx= grid->oIndex(site); - scalar_type * vp = (scalar_type *)&l[odx]; + auto l_v = l.View(); + scalar_type * vp = (scalar_type *)&l_v[odx]; scalar_type * pt = (scalar_type *)&s; - for(int w=0;w inline Lattice adj(const Lattice &lhs){ Lattice ret(lhs.Grid()); - accelerator_loop( ss, lhs, { - ret[ss] = adj(lhs[ss]); + auto lhs_v = lhs.View(); + auto ret_v = ret.View(); + accelerator_loop( ss, lhs_v, { + ret_v[ss] = adj(lhs_v[ss]); }); return ret; }; template inline Lattice conjugate(const Lattice &lhs){ Lattice ret(lhs.Grid()); - accelerator_loop( ss, lhs, { - ret[ss] = conjugate(lhs[ss]); + auto lhs_v = lhs.View(); + auto ret_v = ret.View(); + accelerator_loop( ss, lhs_v, { + ret_v[ss] = conjugate(lhs_v[ss]); }); return ret; }; diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 35ea1e95..6c863afc 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -47,14 +47,17 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ GridBase *grid = left.Grid(); std::vector > sumarray(grid->SumArraySize()); - + + auto left_v = left.View(); + auto right_v=right.View(); + thread_loop( (int thr=0;thrSumArraySize();thr++),{ int mywork, myoff; GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff); - decltype(innerProductD(left[0],right[0])) vnrm=Zero(); // private to thread; sub summation + decltype(innerProductD(left_v[0],right_v[0])) vnrm=Zero(); // private to thread; sub summation for(int ss=myoff;ss &left,const Lattice &righ template inline auto sum(const LatticeUnaryExpression & expr) - ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object + ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object { return sum(closure(expr)); } template inline auto sum(const LatticeBinaryExpression & expr) - ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object + ->typename 
decltype(expr.op.func(eval(0,expr.arg1,eval(0,expr.arg2))))::scalar_object { return sum(closure(expr)); } @@ -85,10 +88,10 @@ inline auto sum(const LatticeBinaryExpression & expr) template inline auto sum(const LatticeTrinaryExpression & expr) - ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), - eval(0,std::get<1>(expr.second)), - eval(0,std::get<2>(expr.second)) - ))::scalar_object + ->typename decltype(expr.op.func(eval(0,expr.arg1), + eval(0,expr.arg2), + eval(0,expr.arg3) + ))::scalar_object { return sum(closure(expr)); } @@ -103,14 +106,14 @@ inline typename vobj::scalar_object sum(const Lattice &arg) for(int i=0;iSumArraySize();i++){ sumarray[i]=Zero(); } - + auto arg_v = arg.View(); thread_loop( (int thr=0;thrSumArraySize();thr++),{ int mywork, myoff; GridThread::GetWork(grid->oSites(),thr,mywork,myoff); vobj vvsum=Zero(); for(int ss=myoff;ss inline void sliceSum(const Lattice &Data,std::vector< int stride=grid->_slice_stride[orthogdim]; // sum over reduced dimension planes, breaking out orthog dir + auto Data_v = Data.View(); thread_loop( (int r=0;r_ostride[orthogdim]; // base offset for start of plane @@ -179,7 +183,7 @@ template inline void sliceSum(const Lattice &Data,std::vector< for(int n=0;n & result, const Latti int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; + auto lhs_v = lhs.View(); + auto rhs_v = rhs.View(); thread_loop( (int r=0;r_ostride[orthogdim]; // base offset for start of plane @@ -258,7 +264,7 @@ static void sliceInnerProductVector( std::vector & result, const Latti for(int n=0;n &R,std::vector &a,const Lattice tensor_reduced at; at=av; + auto X_v = X.View(); + auto Y_v = Y.View(); + auto R_v = R.View(); thread_loop_collapse2( (int n=0;noSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity int words = sizeof(scalar_object) / sizeof(scalar_type); - thread_loop( (int ss=0;ss buf(Nsimd); for (int m = 0; m < multiplicity; m++) { // Draw from same generator 
multiplicity times @@ -361,9 +363,10 @@ public: fillScalar(pointer[idx], dist[gdx], _generators[gdx]); } // merge into SIMD lanes, FIXME suboptimal implementation - merge(l[sm], buf); + merge(l_v[sm], buf); } - }); + } + // }); _time_counter += usecond()- inner_time_counter; } diff --git a/lib/lattice/Lattice_trace.h b/lib/lattice/Lattice_trace.h index 57d6dbce..45f9e47a 100644 --- a/lib/lattice/Lattice_trace.h +++ b/lib/lattice/Lattice_trace.h @@ -38,12 +38,13 @@ NAMESPACE_BEGIN(Grid); // Trace //////////////////////////////////////////////////////////////////////////////////////////////////// template -inline auto trace(const Lattice &lhs) - -> Lattice +inline auto trace(const Lattice &lhs) -> Lattice { - Lattice ret(lhs.Grid()); - accelerator_loop( ss, lhs, { - ret[ss] = trace(lhs[ss]); + Lattice ret(lhs.Grid()); + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); + accelerator_loop( ss, lhs_v, { + ret_v[ss] = trace(lhs_v[ss]); }); return ret; }; @@ -52,11 +53,13 @@ inline auto trace(const Lattice &lhs) // Trace Index level dependent operation //////////////////////////////////////////////////////////////////////////////////////////////////// template -inline auto TraceIndex(const Lattice &lhs) -> Lattice(lhs[0]))> +inline auto TraceIndex(const Lattice &lhs) -> Lattice(vobj()))> { - Lattice(lhs[0]))> ret(lhs.Grid()); - accelerator_loop( ss, lhs, { - ret[ss] = traceIndex(lhs[ss]); + Lattice(vobj()))> ret(lhs.Grid()); + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); + accelerator_loop( ss, lhs_v, { + ret_v[ss] = traceIndex(lhs_v[ss]); }); return ret; }; diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index ba23b0cb..821890a5 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -51,20 +51,24 @@ inline void subdivides(GridBase *coarse,GridBase *fine) template inline void pickCheckerboard(int cb,Lattice &half,const Lattice &full){ half.Checkerboard() = cb; + auto half_v = half.View(); + auto full_v = 
full.View(); thread_loop( (int ss=0;ssoSites();ss++),{ int cbos; Coordinate coor; full.Grid()->oCoorFromOindex(coor,ss); cbos=half.Grid()->CheckerBoard(coor); - + if (cbos==cb) { int ssh=half.Grid()->oIndex(coor); - half[ssh] = full[ss]; + half_v[ssh] = full_v[ss]; } }); } template inline void setCheckerboard(Lattice &full,const Lattice &half){ int cb = half.Checkerboard(); + auto half_v = half.View(); + auto full_v = full.View(); thread_loop( (int ss=0;ssoSites();ss++), { Coordinate coor; int cbos; @@ -74,7 +78,7 @@ template inline void setCheckerboard(Lattice &full,const Latti if (cbos==cb) { int ssh=half.Grid()->oIndex(coor); - full[ss]=half[ssh]; + full_v[ss]=half_v[ssh]; } }); } @@ -105,6 +109,8 @@ inline void blockProject(Lattice > &coarseData, coarseData=Zero(); + auto fineData_ = fineData.View(); + auto coarseData_ = coarseData.View(); // Loop over coars parallel, and then loop over fine associated with coarse. thread_loop( (int sf=0;sfoSites();sf++),{ @@ -117,9 +123,8 @@ inline void blockProject(Lattice > &coarseData, thread_critical { for(int i=0;i &fineZ, assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); } + auto fineZ_ = fineZ.View(); + auto fineX_ = fineX.View(); + auto fineY_ = fineY.View(); + auto coarseA_= coarseA.View(); + thread_loop( (int sf=0;sfoSites();sf++),{ int sc; @@ -162,7 +172,7 @@ inline void blockZAXPY(Lattice &fineZ, Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); // z = A x + y - fineZ[sf]=coarseA[sc]*fineX[sf]+fineY[sf]; + fineZ_[sf]=coarseA_[sc]*fineX_[sf]+fineY_[sf]; }); @@ -173,7 +183,7 @@ inline void blockInnerProduct(Lattice &CoarseInner, const Lattice &fineX, const Lattice &fineY) { - typedef decltype(innerProduct(fineX[0],fineY[0])) dotp; + typedef decltype(innerProduct(vobj(),vobj())) dotp; GridBase *coarse(CoarseInner.Grid()); GridBase *fine (fineX.Grid()); @@ -182,10 +192,13 @@ inline void blockInnerProduct(Lattice &CoarseInner, Lattice coarse_inner(coarse); // Precision promotion? 
+ auto CoarseInner_ = CoarseInner.View(); + auto coarse_inner_ = coarse_inner.View(); + fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); thread_loop( (int ss=0;ssoSites();ss++),{ - CoarseInner[ss] = coarse_inner[ss]; + CoarseInner_[ss] = coarse_inner_[ss]; }); } template @@ -218,24 +231,23 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) // Turn this around to loop threaded over sc and interior loop // over sf would thread better coarseData=Zero(); - thread_region { + auto coarseData_ = coarseData.View(); + auto fineData_ = fineData.View(); + thread_loop( (int sf=0;sfoSites();sf++),{ int sc; Coordinate coor_c(_ndimension); Coordinate coor_f(_ndimension); - - thread_loop_in_region( (int sf=0;sfoSites();sf++),{ - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - - thread_critical { - coarseData[sc]=coarseData[sc]+fineData[sf]; - } + Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); + for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; + Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + + thread_critical { + coarseData_[sc]=coarseData_[sc]+fineData_[sf]; + } - }); - } + }); return; } @@ -306,25 +318,25 @@ inline void blockPromote(const Lattice > &coarseData, for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } + auto fineData_ = fineData.View(); + auto coarseData_ = coarseData.View(); // Loop with a cache friendly loop ordering - thread_region { + thread_loop( (int sf=0;sfoSites();sf++),{ int sc; Coordinate coor_c(_ndimension); Coordinate coor_f(_ndimension); - thread_loop_in_region( (int sf=0;sfoSites();sf++),{ + Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); + for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; + Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - 
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - - for(int i=0;i &out, const Lattice &in) } //loop over outer index + auto in_v = in.View(); thread_loop( (int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++),{ //Assemble vector of pointers to output elements ExtractPointerArray out_ptrs(in_nsimd); @@ -587,16 +600,19 @@ unvectorizeToLexOrdArray(std::vector &out, const Lattice &in) Coordinate lcoor(in_grid->Nd()); for(int lane=0; lane < in_nsimd; lane++){ - for(int mu=0;mu_rdimensions[mu]*in_icoor[lane][mu]; + } int lex; Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions); + assert(lex < out.size()); out_ptrs[lane] = &out[lex]; } //Unpack into those ptrs - const vobj & in_vobj = in[in_oidx]; + const vobj & in_vobj = in_v[in_oidx]; extract(in_vobj, out_ptrs, 0); }); } @@ -621,7 +637,7 @@ vectorizeFromLexOrdArray( std::vector &in, Lattice &out) icoor[lane].resize(ndim); grid->iCoorFromIindex(icoor[lane],lane); } - + auto out_v = out.View(); thread_loop( (uint64_t oidx = 0; oidx < grid->oSites(); oidx++),{ //Assemble vector of pointers to output elements ExtractPointerArray ptrs(nsimd); @@ -644,7 +660,7 @@ vectorizeFromLexOrdArray( std::vector &in, Lattice &out) //pack from those ptrs vobj vecobj; merge(vecobj, ptrs, 0); - out[oidx] = vecobj; + out_v[oidx] = vecobj; }); } @@ -673,6 +689,7 @@ void precisionChange(Lattice &out, const Lattice &in){ std::vector in_slex_conv(in_grid->lSites()); unvectorizeToLexOrdArray(in_slex_conv, in); + auto out_v = out.View(); thread_loop( (uint64_t out_oidx=0;out_oidxoSites();out_oidx++),{ Coordinate out_ocoor(ndim); out_grid->oCoorFromOindex(out_ocoor, out_oidx); @@ -688,7 +705,7 @@ void precisionChange(Lattice &out, const Lattice &in){ int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions); ptrs[lane] = &in_slex_conv[llex]; } - merge(out[out_oidx], ptrs, 0); 
+ merge(out_v[out_oidx], ptrs, 0); }); } diff --git a/lib/lattice/Lattice_transpose.h b/lib/lattice/Lattice_transpose.h index dcfca494..facc2be3 100644 --- a/lib/lattice/Lattice_transpose.h +++ b/lib/lattice/Lattice_transpose.h @@ -41,8 +41,10 @@ NAMESPACE_BEGIN(Grid); template inline Lattice transpose(const Lattice &lhs){ Lattice ret(lhs.Grid()); - accelerator_loop(ss,lhs,{ - ret[ss] = transpose(lhs[ss]); + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); + accelerator_loop(ss,lhs_v,{ + ret_v[ss] = transpose(lhs_v[ss]); }); return ret; }; @@ -51,11 +53,13 @@ inline Lattice transpose(const Lattice &lhs){ // Index level dependent transpose //////////////////////////////////////////////////////////////////////////////////////////////////// template -inline auto TransposeIndex(const Lattice &lhs) -> Lattice(lhs[0]))> +inline auto TransposeIndex(const Lattice &lhs) -> Lattice(vobj()))> { - Lattice(lhs[0]))> ret(lhs.Grid()); - accelerator_loop(ss,lhs,{ - ret[ss] = transposeIndex(lhs[ss]); + Lattice(vobj()))> ret(lhs.Grid()); + auto ret_v = ret.View(); + auto lhs_v = lhs.View(); + accelerator_loop(ss,lhs_v,{ + ret_v[ss] = transposeIndex(lhs_v[ss]); }); return ret; }; diff --git a/lib/lattice/Lattice_unary.h b/lib/lattice/Lattice_unary.h index a5ac0bb7..25621a0b 100644 --- a/lib/lattice/Lattice_unary.h +++ b/lib/lattice/Lattice_unary.h @@ -33,43 +33,47 @@ Author: paboyle NAMESPACE_BEGIN(Grid); -template Lattice pow(const Lattice &rhs,RealD y){ - Lattice ret(rhs.Grid()); +template Lattice pow(const Lattice &rhs_i,RealD y){ + Lattice ret_i(rhs_i.Grid()); + auto rhs = rhs_i.View(); + auto ret = ret_i.View(); ret.Checkerboard() = rhs.Checkerboard(); - conformable(ret,rhs); accelerator_loop(ss,rhs,{ ret[ss]=pow(rhs[ss],y); }); - return ret; + return ret_i; } -template Lattice mod(const Lattice &rhs,Integer y){ - Lattice ret(rhs.Grid()); +template Lattice mod(const Lattice &rhs_i,Integer y){ + Lattice ret_i(rhs_i.Grid()); + auto rhs = rhs_i.View(); + auto ret = ret_i.View(); 
ret.Checkerboard() = rhs.Checkerboard(); - conformable(ret,rhs); accelerator_loop(ss,rhs,{ ret[ss]=mod(rhs[ss],y); }); - return ret; + return ret_i; } -template Lattice div(const Lattice &rhs,Integer y){ - Lattice ret(rhs.Grid()); - ret.Checkerboard() = rhs.Checkerboard(); - conformable(ret,rhs); +template Lattice div(const Lattice &rhs_i,Integer y){ + Lattice ret_i(rhs_i.Grid()); + auto ret = ret_i.View(); + auto rhs = rhs_i.View(); + ret.Checkerboard() = rhs_i.Checkerboard(); accelerator_loop(ss,rhs,{ ret[ss]=div(rhs[ss],y); }); - return ret; + return ret_i; } -template Lattice expMat(const Lattice &rhs, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ - Lattice ret(rhs.Grid()); +template Lattice expMat(const Lattice &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ + Lattice ret_i(rhs_i.Grid()); + auto rhs = rhs_i.View(); + auto ret = ret_i.View(); ret.Checkerboard() = rhs.Checkerboard(); - conformable(ret,rhs); accelerator_loop(ss,rhs,{ ret[ss]=Exponentiate(rhs[ss],alpha, Nexp); }); - return ret; + return ret_i; } NAMESPACE_END(Grid); diff --git a/lib/lattice/Lattice_where.h b/lib/lattice/Lattice_where.h deleted file mode 100644 index 3aa9098c..00000000 --- a/lib/lattice/Lattice_where.h +++ /dev/null @@ -1,90 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: ./lib/lattice/Lattice_where.h - - Copyright (C) 2015 - -Author: Azusa Yamaguchi -Author: Peter Boyle -Author: Peter Boyle - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#ifndef GRID_LATTICE_WHERE_H -#define GRID_LATTICE_WHERE_H - -NAMESPACE_BEGIN(Grid); - -// Must implement the predicate gating the -// Must be able to reduce the predicate down to a single vInteger per site. -// Must be able to require the type be iScalar x iScalar x .... -// give a GetVtype method in iScalar -// and blow away the tensor structures. -// -template -inline void whereWolf(Lattice &ret,const Lattice &predicate,Lattice &iftrue,Lattice &iffalse) -{ - conformable(iftrue,iffalse); - conformable(iftrue,predicate); - conformable(iftrue,ret); - - GridBase *grid=iftrue.Grid(); - - typedef typename vobj::scalar_object scalar_object; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename iobj::vector_type mask_type; - - const int Nsimd = grid->Nsimd(); - - std::vector mask(Nsimd); - std::vector truevals (Nsimd); - std::vector falsevals(Nsimd); - - thread_loop( (int ss=iftrue.begin(); ss(TensorRemove(predicate[ss]),mask); - - for(int s=0;s -inline Lattice whereWolf(const Lattice &predicate,Lattice &iftrue,Lattice &iffalse) -{ - conformable(iftrue,iffalse); - conformable(iftrue,predicate); - - Lattice ret(iftrue.Grid()); - - where(ret,predicate,iftrue,iffalse); - - return ret; -} - -NAMESPACE_END(Grid); -#endif