From 47b89d2739facbea82f96a52c836173dfc37f35a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 26 Aug 2020 17:04:27 -0400 Subject: [PATCH 01/32] Pragma protection improvementt --- Grid/Grid_Eigen_Dense.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index 9556c03d..e74d3894 100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -42,7 +42,7 @@ #ifdef __NVCC__REDEFINE__ #pragma pop_macro("__CUDACC__") #pragma pop_macro("__NVCC__") -#pragma pop_macro("GRID_SIMT") +#pragma pop_macro("__CUDA_ARCH__") #pragma pop #endif From 3448b7387c80d9789ce0c18e2e94f1f2d70bdd9e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 26 Aug 2020 17:04:49 -0400 Subject: [PATCH 02/32] Almost there to coalesced ET --- Grid/lattice/Lattice_ET.h | 102 +++++++++++++++++++++++++++++++----- Grid/lattice/Lattice_base.h | 15 +++--- 2 files changed, 98 insertions(+), 19 deletions(-) diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index 4407be04..24ec812c 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -42,9 +42,25 @@ NAMESPACE_BEGIN(Grid); //////////////////////////////////////////////////// // Predicated where support //////////////////////////////////////////////////// +#ifdef GRID_SIMT template -accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, - const robj &iffalse) { +accelerator_inline vobj predicatedWhere(const iobj &predicate, + const vobj &iftrue, + const robj &iffalse) +{ + // should drop to sccalar in SIMT + // typename std::remove_const::type ret; + // Integer mask = TensorRemove(predicate); + // ret = iffalse; + // if (TensorRemove(mask)) ret=iftrue; + return iftrue; +} +#else +template +accelerator_inline vobj predicatedWhere(const iobj &predicate, + const vobj &iftrue, + const robj &iffalse) +{ typename std::remove_const::type ret; typedef typename vobj::scalar_object scalar_object; @@ -68,6 +84,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru merge(ret, falsevals); return ret; } +#endif ///////////////////////////////////////////////////// //Specialization of getVectorType for lattices @@ -86,13 +103,45 @@ sobj eval(const uint64_t ss, const sobj &arg) { return arg; } - template accelerator_inline const lobj & eval(const uint64_t ss, const LatticeView &arg) { return arg[ss]; } +template::value&&!is_lattice_expr::value,sobj>::type * = nullptr> +accelerator_inline +sobj coalescedEval(const uint64_t ss, const sobj &arg) +{ + return arg; +} +#ifdef GRID_SIMT +#warning device +template accelerator_inline +typename lobj::scalar_object coalescedEval(const uint64_t ss, const LatticeView &arg) +{ + auto ret = arg(ss); + return ret; +} +#else +#warning host +template accelerator_inline +lobj coalescedEval(const uint64_t ss, const LatticeView &arg) +{ + // return coalescedRead(arg[ss]); + return arg[ss]; +} +#endif +/* +template accelerator_inline +typename lobj::scalar_type coalescedEval(const uint64_t ss, const Lattice &arg) +{ + assert(0); + return coalescedRead(arg[ss]); +} +*/ + // What needs this? // Cannot be legal on accelerator // Comparison must convert @@ -115,18 +164,14 @@ auto eval(const uint64_t ss, const LatticeUnaryExpression &expr) { return expr.op.func( eval(ss, expr.arg1) ); } -/////////////////////// // eval two operands -/////////////////////// template accelerator_inline auto eval(const uint64_t ss, const LatticeBinaryExpression &expr) -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2))) { return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) ); } -/////////////////////// // eval three operands -/////////////////////// template accelerator_inline auto eval(const uint64_t ss, const LatticeTrinaryExpression &expr) -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3))) @@ -134,6 +179,34 @@ auto eval(const uint64_t ss, const LatticeTrinaryExpression &exp return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)); } +/////////////////////////////////////////////////// +// handle nodes in syntax tree- eval one operand coalesced +/////////////////////////////////////////////////// +template accelerator_inline +auto coalescedEval(const uint64_t ss, const LatticeUnaryExpression &expr) + -> decltype(expr.op.func( coalescedEval(ss, expr.arg1))) +{ + return expr.op.func( coalescedEval(ss, expr.arg1) ); +} +// eval two operands +template accelerator_inline +auto coalescedEval(const uint64_t ss, const LatticeBinaryExpression &expr) + -> decltype(expr.op.func( coalescedEval(ss,expr.arg1),coalescedEval(ss,expr.arg2))) +{ + return expr.op.func( coalescedEval(ss,expr.arg1), coalescedEval(ss,expr.arg2) ); +} +// eval three operands +template accelerator_inline +auto coalescedEval(const uint64_t ss, const LatticeTrinaryExpression &expr) + -> decltype(expr.op.func(coalescedEval(ss, expr.arg1), + coalescedEval(ss, expr.arg2), + coalescedEval(ss, expr.arg3))) +{ + return expr.op.func(coalescedEval(ss, expr.arg1), + coalescedEval(ss, expr.arg2), + coalescedEval(ss, expr.arg3)); +} + ////////////////////////////////////////////////////////////////////////// // Obtain the grid from an expression, ensuring conformable. This must follow a // tree recursion; must retain grid pointer in the LatticeView class which sucks @@ -275,7 +348,7 @@ inline void ExpressionViewClose(LatticeTrinaryExpression &expr) #define GridUnopClass(name, ret) \ template \ struct name { \ - static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \ + template static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \ }; GridUnopClass(UnarySub, -a); @@ -308,8 +381,9 @@ GridUnopClass(UnaryExp, exp(a)); #define GridBinOpClass(name, combination) \ template \ struct name { \ + template \ static auto accelerator_inline \ - func(const left &lhs, const right &rhs) \ + func(const _left &lhs, const _right &rhs) \ -> decltype(combination) const \ { \ return combination; \ @@ -331,8 +405,9 @@ GridBinOpClass(BinaryOrOr, lhs || rhs); #define GridTrinOpClass(name, combination) \ template \ struct name { \ + template \ static auto accelerator_inline \ - func(const predicate &pred, const left &lhs, const right &rhs) \ + func(const _predicate &pred, const _left &lhs, const _right &rhs) \ -> decltype(combination) const \ { \ return combination; \ @@ -340,9 +415,10 @@ GridBinOpClass(BinaryOrOr, lhs || rhs); }; GridTrinOpClass(TrinaryWhere, - (predicatedWhere::type, - typename std::remove_reference::type>(pred, lhs,rhs))); + (predicatedWhere< + typename std::remove_reference<_predicate>::type, + typename std::remove_reference<_left>::type, + typename std::remove_reference<_right>::type>(pred, lhs,rhs))); //////////////////////////////////////////// // Operator syntactical glue diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 73b1b6a1..2d972970 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -124,8 +124,9 @@ public: ExpressionViewOpen(exprCopy); auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),1,{ - auto tmp = eval(ss,exprCopy); - vstream(me[ss],tmp); + auto tmp = coalescedEval(ss,exprCopy); + coalescedWrite(me[ss],tmp); + // me[ss]=tmp; }); me.ViewClose(); ExpressionViewClose(exprCopy); @@ -147,8 +148,9 @@ public: ExpressionViewOpen(exprCopy); auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),1,{ - auto tmp = eval(ss,exprCopy); - vstream(me[ss],tmp); + auto tmp = coalescedEval(ss,exprCopy); + coalescedWrite(me[ss],tmp); + //me[ss]=tmp; }); me.ViewClose(); ExpressionViewClose(exprCopy); @@ -169,8 +171,9 @@ public: ExpressionViewOpen(exprCopy); auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),1,{ - auto tmp = eval(ss,exprCopy); - vstream(me[ss],tmp); + auto tmp = coalescedEval(ss,exprCopy); + coalescedWrite(me[ss],tmp); + // me[ss]=tmp; }); me.ViewClose(); ExpressionViewClose(exprCopy); From 65920faebae56dd8e28d43442fe4010c2f491de0 Mon Sep 17 00:00:00 2001 From: Bartosz Kostrzewa Date: Mon, 31 Aug 2020 18:39:27 +0200 Subject: [PATCH 03/32] correct formatting of Benchmark_wilson_sweep output --- benchmarks/Benchmark_wilson_sweep.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc index f1473b44..100e09f5 100644 --- a/benchmarks/Benchmark_wilson_sweep.cc +++ b/benchmarks/Benchmark_wilson_sweep.cc @@ -89,6 +89,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage; std::cout << latt_size; + std::cout << "\t\t"; GridCartesian Grid(latt_size,simd_layout,mpi_layout); GridRedBlackCartesian RBGrid(&Grid); From a9b92867a817deed5c041a268d97c54d0674e56a Mon Sep 17 00:00:00 2001 From: Bartosz Kostrzewa Date: Mon, 31 Aug 2020 18:41:17 +0200 Subject: [PATCH 04/32] use tabulator --- benchmarks/Benchmark_wilson_sweep.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc index 100e09f5..986a1831 100644 --- a/benchmarks/Benchmark_wilson_sweep.cc +++ b/benchmarks/Benchmark_wilson_sweep.cc @@ -89,7 +89,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage; std::cout << latt_size; - std::cout << "\t\t"; + std::cout << "\t\t"; GridCartesian Grid(latt_size,simd_layout,mpi_layout); GridRedBlackCartesian RBGrid(&Grid); From 54523369a3096750dac18f2150a23d6d98709030 Mon Sep 17 00:00:00 2001 From: Bartosz Kostrzewa Date: Mon, 31 Aug 2020 19:39:36 +0200 Subject: [PATCH 05/32] do not use backspace in Coordinate output stream operator --- Grid/util/Coordinate.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/util/Coordinate.h b/Grid/util/Coordinate.h index 7f1d31c0..004fbc72 100644 --- a/Grid/util/Coordinate.h +++ b/Grid/util/Coordinate.h @@ -99,10 +99,10 @@ inline std::ostream & operator<<(std::ostream &os, const AcceleratorVector 0) { - os << "\b"; + os << v[s]; + if( s < (v.size()-1) ){ + os << " "; + } } os << "]"; return os; From 1eee94a80943740c90d5f79dbef8a52d4192042a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 31 Aug 2020 23:36:49 -0400 Subject: [PATCH 06/32] Sorting real/im in read coalesced GPU ET --- Grid/lattice/Lattice.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index a3017198..28ea0294 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -37,6 +37,7 @@ Author: Peter Boyle #include #include //#include +#include #include #include #include From ed469898dc4b05ec6f9550f7f936b3de5b405e56 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 31 Aug 2020 23:38:40 -0400 Subject: [PATCH 07/32] coalesced ET expressions --- Grid/lattice/Lattice_base.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 2d972970..3ad9f913 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -123,10 +123,9 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); auto me = View(AcceleratorWriteDiscard); - accelerator_for(ss,me.size(),1,{ - auto tmp = coalescedEval(ss,exprCopy); + accelerator_for(ss,me.size(),vobj::Nsimd(),{ + auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); - // me[ss]=tmp; }); me.ViewClose(); ExpressionViewClose(exprCopy); @@ -147,10 +146,9 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); auto me = View(AcceleratorWriteDiscard); - accelerator_for(ss,me.size(),1,{ - auto tmp = coalescedEval(ss,exprCopy); + accelerator_for(ss,me.size(),vobj::Nsimd(),{ + auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); - //me[ss]=tmp; }); me.ViewClose(); ExpressionViewClose(exprCopy); @@ -170,10 +168,9 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); auto me = View(AcceleratorWriteDiscard); - accelerator_for(ss,me.size(),1,{ - auto tmp = coalescedEval(ss,exprCopy); + accelerator_for(ss,me.size(),vobj::Nsimd(),{ + auto tmp = eval(ss,exprCopy); coalescedWrite(me[ss],tmp); - // me[ss]=tmp; }); me.ViewClose(); ExpressionViewClose(exprCopy); From 9522dcd61118ef6d98009a48cdd4cce4c53757d1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 31 Aug 2020 23:40:29 -0400 Subject: [PATCH 08/32] Remove dead commented ouot coode --- Grid/lattice/Lattice_comparison.h | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/Grid/lattice/Lattice_comparison.h b/Grid/lattice/Lattice_comparison.h index 6a29be94..b9912639 100644 --- a/Grid/lattice/Lattice_comparison.h +++ b/Grid/lattice/Lattice_comparison.h @@ -42,34 +42,6 @@ NAMESPACE_BEGIN(Grid); typedef iScalar vPredicate ; -/* -template accelerator_inline -vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse) -{ - typename std::remove_const::type ret; - - typedef typename vobj::scalar_object scalar_object; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - const int Nsimd = vobj::vector_type::Nsimd(); - - ExtractBuffer mask(Nsimd); - ExtractBuffer truevals(Nsimd); - ExtractBuffer falsevals(Nsimd); - - extract(iftrue, truevals); - extract(iffalse, falsevals); - extract(TensorRemove(predicate), mask); - - for (int s = 0; s < Nsimd; s++) { - if (mask[s]) falsevals[s] = truevals[s]; - } - - merge(ret, falsevals); - return ret; -} -*/ ////////////////////////////////////////////////////////////////////////// // compare lattice to lattice ////////////////////////////////////////////////////////////////////////// From 6c31b99f1f3624f93aeb5a22580969604daddb6f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 31 Aug 2020 23:49:19 -0400 Subject: [PATCH 09/32] I knew coupling Eigen Tensor to Grid serialisation was a bad iddea. Now the complex is different on GPU creates probblems --- Grid/tensors/Tensor_traits.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h index 04d7343e..dfa3c043 100644 --- a/Grid/tensors/Tensor_traits.h +++ b/Grid/tensors/Tensor_traits.h @@ -190,6 +190,36 @@ NAMESPACE_BEGIN(Grid); typedef ComplexD DoublePrecision; typedef ComplexD DoublePrecision2; }; + +#ifdef GRID_CUDA + template<> struct GridTypeMapper > : public GridTypeMapper_Base { + typedef std::complex scalar_type; + typedef std::complex scalar_typeD; + typedef scalar_type vector_type; + typedef scalar_typeD vector_typeD; + typedef scalar_type tensor_reduced; + typedef scalar_type scalar_object; + typedef scalar_typeD scalar_objectD; + typedef scalar_type Complexified; + typedef RealF Realified; + typedef scalar_typeD DoublePrecision; + typedef scalar_typeD DoublePrecision2; + }; + template<> struct GridTypeMapper > : public GridTypeMapper_Base { + typedef std::complex scalar_type; + typedef std::complex scalar_typeD; + typedef scalar_type vector_type; + typedef scalar_typeD vector_typeD; + typedef scalar_type tensor_reduced; + typedef scalar_type scalar_object; + typedef scalar_typeD scalar_objectD; + typedef scalar_type Complexified; + typedef RealD Realified; + typedef scalar_typeD DoublePrecision; + typedef scalar_typeD DoublePrecision2; + }; +#endif + template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexD2 scalar_type; typedef ComplexD2 scalar_typeD; From e14a84317d722808ef738b1789200a9b00a4f364 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 31 Aug 2020 23:50:49 -0400 Subject: [PATCH 10/32] GPU math unary calls --- Grid/simd/Simd.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Grid/simd/Simd.h b/Grid/simd/Simd.h index 37aee2ed..1dc86c1b 100644 --- a/Grid/simd/Simd.h +++ b/Grid/simd/Simd.h @@ -93,6 +93,11 @@ accelerator_inline ComplexF pow(const ComplexF& r,RealF y){ return(std::pow(r,y) using std::abs; using std::pow; using std::sqrt; +using std::log; +using std::exp; +using std::sin; +using std::cos; + accelerator_inline RealF conjugate(const RealF & r){ return r; } accelerator_inline RealD conjugate(const RealD & r){ return r; } From 7d14a3c0866bd76b8f74076bb03262c58e38615a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 31 Aug 2020 23:53:46 -0400 Subject: [PATCH 11/32] Where working --- Grid/lattice/Lattice_ET.h | 185 ++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 98 deletions(-) diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index 24ec812c..c43844f8 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -43,17 +43,16 @@ NAMESPACE_BEGIN(Grid); // Predicated where support //////////////////////////////////////////////////// #ifdef GRID_SIMT +// drop to scalar in SIMT; cleaner in fact template accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse) { - // should drop to sccalar in SIMT - // typename std::remove_const::type ret; - // Integer mask = TensorRemove(predicate); - // ret = iffalse; - // if (TensorRemove(mask)) ret=iftrue; - return iftrue; + Integer mask = TensorRemove(predicate); + typename std::remove_const::type ret= iffalse; + if (mask) ret=iftrue; + return ret; } #else template @@ -98,65 +97,62 @@ struct getVectorType >{ //-- recursive evaluation of expressions; -- // handle leaves of syntax tree /////////////////////////////////////////////////// -template accelerator_inline +template::value&&!is_lattice_expr::value,sobj>::type * = nullptr> +accelerator_inline sobj eval(const uint64_t ss, const sobj &arg) { return arg; } template accelerator_inline -const lobj & eval(const uint64_t ss, const LatticeView &arg) +auto eval(const uint64_t ss, const LatticeView &arg) -> decltype(arg(ss)) { - return arg[ss]; + return arg(ss); } -template::value&&!is_lattice_expr::value,sobj>::type * = nullptr> -accelerator_inline -sobj coalescedEval(const uint64_t ss, const sobj &arg) +//////////////////////////////////////////// +//-- recursive evaluation of expressions; -- +// whole vector return, used only for expression return type inference +/////////////////////////////////////////////////// +template accelerator_inline +sobj vecEval(const uint64_t ss, const sobj &arg) { return arg; } -#ifdef GRID_SIMT -#warning device template accelerator_inline -typename lobj::scalar_object coalescedEval(const uint64_t ss, const LatticeView &arg) +const lobj & vecEval(const uint64_t ss, const LatticeView &arg) { - auto ret = arg(ss); - return ret; -} -#else -#warning host -template accelerator_inline -lobj coalescedEval(const uint64_t ss, const LatticeView &arg) -{ - // return coalescedRead(arg[ss]); return arg[ss]; } -#endif -/* -template accelerator_inline -typename lobj::scalar_type coalescedEval(const uint64_t ss, const Lattice &arg) -{ - assert(0); - return coalescedRead(arg[ss]); -} -*/ - -// What needs this? -// Cannot be legal on accelerator -// Comparison must convert -#if 1 -template accelerator_inline -const lobj & eval(const uint64_t ss, const Lattice &arg) -{ - assert(0); - auto view = arg.View(AcceleratorRead); - return view[ss]; -} -#endif /////////////////////////////////////////////////// // handle nodes in syntax tree- eval one operand +// vecEval needed (but never called as all expressions offloaded) to infer the return type +// in SIMT contexts of closure. +/////////////////////////////////////////////////// +template accelerator_inline +auto vecEval(const uint64_t ss, const LatticeUnaryExpression &expr) + -> decltype(expr.op.func( vecEval(ss, expr.arg1))) +{ + return expr.op.func( vecEval(ss, expr.arg1) ); +} +// vecEval two operands +template accelerator_inline +auto vecEval(const uint64_t ss, const LatticeBinaryExpression &expr) + -> decltype(expr.op.func( vecEval(ss,expr.arg1),vecEval(ss,expr.arg2))) +{ + return expr.op.func( vecEval(ss,expr.arg1), vecEval(ss,expr.arg2) ); +} +// vecEval three operands +template accelerator_inline +auto vecEval(const uint64_t ss, const LatticeTrinaryExpression &expr) + -> decltype(expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3))) +{ + return expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3)); +} + +/////////////////////////////////////////////////// +// handle nodes in syntax tree- eval one operand coalesced /////////////////////////////////////////////////// template accelerator_inline auto eval(const uint64_t ss, const LatticeUnaryExpression &expr) @@ -174,37 +170,31 @@ auto eval(const uint64_t ss, const LatticeBinaryExpression &expr) // eval three operands template accelerator_inline auto eval(const uint64_t ss, const LatticeTrinaryExpression &expr) - -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3))) + -> decltype(expr.op.func(eval(ss, expr.arg1), + eval(ss, expr.arg2), + eval(ss, expr.arg3))) { - return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)); -} +#ifdef GRID_SIMT + // Handles Nsimd (vInteger) != Nsimd(ComplexD) + typedef decltype(vecEval(ss, expr.arg2)) rvobj; + typedef typename std::remove_reference::type vobj; -/////////////////////////////////////////////////// -// handle nodes in syntax tree- eval one operand coalesced -/////////////////////////////////////////////////// -template accelerator_inline -auto coalescedEval(const uint64_t ss, const LatticeUnaryExpression &expr) - -> decltype(expr.op.func( coalescedEval(ss, expr.arg1))) -{ - return expr.op.func( coalescedEval(ss, expr.arg1) ); -} -// eval two operands -template accelerator_inline -auto coalescedEval(const uint64_t ss, const LatticeBinaryExpression &expr) - -> decltype(expr.op.func( coalescedEval(ss,expr.arg1),coalescedEval(ss,expr.arg2))) -{ - return expr.op.func( coalescedEval(ss,expr.arg1), coalescedEval(ss,expr.arg2) ); -} -// eval three operands -template accelerator_inline -auto coalescedEval(const uint64_t ss, const LatticeTrinaryExpression &expr) - -> decltype(expr.op.func(coalescedEval(ss, expr.arg1), - coalescedEval(ss, expr.arg2), - coalescedEval(ss, expr.arg3))) -{ - return expr.op.func(coalescedEval(ss, expr.arg1), - coalescedEval(ss, expr.arg2), - coalescedEval(ss, expr.arg3)); + const int Nsimd = vobj::vector_type::Nsimd(); + + auto vpred = vecEval(ss,expr.arg1); + + ExtractBuffer mask(Nsimd); + extract(TensorRemove(vpred), mask); + + int s = acceleratorSIMTlane(Nsimd); + return expr.op.func(mask[s], + eval(ss, expr.arg2), + eval(ss, expr.arg3)); +#else + return expr.op.func(eval(ss, expr.arg1), + eval(ss, expr.arg2), + eval(ss, expr.arg3)); +#endif } ////////////////////////////////////////////////////////////////////////// @@ -302,7 +292,7 @@ template inline void ExpressionViewOpen(LatticeBinaryExpression &expr) { ExpressionViewOpen(expr.arg1); // recurse AST - ExpressionViewOpen(expr.arg2); // recurse AST + ExpressionViewOpen(expr.arg2); // rrecurse AST } template inline void ExpressionViewOpen(LatticeTrinaryExpression &expr) @@ -346,7 +336,6 @@ inline void ExpressionViewClose(LatticeTrinaryExpression &expr) // Unary operators and funcs //////////////////////////////////////////// #define GridUnopClass(name, ret) \ - template \ struct name { \ template static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \ }; @@ -359,8 +348,6 @@ GridUnopClass(UnaryTrace, trace(a)); GridUnopClass(UnaryTranspose, transpose(a)); GridUnopClass(UnaryTa, Ta(a)); GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a)); -GridUnopClass(UnaryReal, real(a)); -GridUnopClass(UnaryImag, imag(a)); GridUnopClass(UnaryToReal, toReal(a)); GridUnopClass(UnaryToComplex, toComplex(a)); GridUnopClass(UnaryTimesI, timesI(a)); @@ -379,7 +366,6 @@ GridUnopClass(UnaryExp, exp(a)); // Binary operators //////////////////////////////////////////// #define GridBinOpClass(name, combination) \ - template \ struct name { \ template \ static auto accelerator_inline \ @@ -403,7 +389,6 @@ GridBinOpClass(BinaryOrOr, lhs || rhs); // Trinary conditional op //////////////////////////////////////////////////// #define GridTrinOpClass(name, combination) \ - template \ struct name { \ template \ static auto accelerator_inline \ @@ -423,10 +408,9 @@ GridTrinOpClass(TrinaryWhere, //////////////////////////////////////////// // Operator syntactical glue //////////////////////////////////////////// - -#define GRID_UNOP(name) name -#define GRID_BINOP(name) name -#define GRID_TRINOP(name) name +#define GRID_UNOP(name) name +#define GRID_BINOP(name) name +#define GRID_TRINOP(name) name #define GRID_DEF_UNOP(op, name) \ template ::value||is_lattice_expr::value,T1>::type * = nullptr> \ @@ -478,8 +462,6 @@ GRID_DEF_UNOP(trace, UnaryTrace); GRID_DEF_UNOP(transpose, UnaryTranspose); GRID_DEF_UNOP(Ta, UnaryTa); GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup); -GRID_DEF_UNOP(real, UnaryReal); -GRID_DEF_UNOP(imag, UnaryImag); GRID_DEF_UNOP(toReal, UnaryToReal); GRID_DEF_UNOP(toComplex, UnaryToComplex); GRID_DEF_UNOP(timesI, UnaryTimesI); @@ -512,29 +494,36 @@ GRID_DEF_TRINOP(where, TrinaryWhere); ///////////////////////////////////////////////////////////// template auto closure(const LatticeUnaryExpression &expr) - -> Lattice + -> Lattice { - Lattice ret(expr); + Lattice ret(expr); return ret; } template auto closure(const LatticeBinaryExpression &expr) - -> Lattice + -> Lattice { - Lattice ret(expr); + Lattice ret(expr); return ret; } template auto closure(const LatticeTrinaryExpression &expr) - -> Lattice + -> Lattice { - Lattice ret(expr); + Lattice ret(expr); return ret; } +#define EXPRESSION_CLOSURE(function) \ + template::value,void>::type * = nullptr> \ + auto function(Expression &expr) -> decltype(function(closure(expr))) \ + { \ + return function(closure(expr)); \ + } + #undef GRID_UNOP #undef GRID_BINOP From b918744184c13cab718bcaeb021805a1281d616d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 31 Aug 2020 23:54:46 -0400 Subject: [PATCH 12/32] Prettificatoin --- Grid/qcd/utils/SUnAdjoint.h | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/Grid/qcd/utils/SUnAdjoint.h b/Grid/qcd/utils/SUnAdjoint.h index 1d530373..18d6b875 100644 --- a/Grid/qcd/utils/SUnAdjoint.h +++ b/Grid/qcd/utils/SUnAdjoint.h @@ -39,7 +39,7 @@ public: typedef iSUnAdjointMatrix AMatrixF; typedef iSUnAdjointMatrix AMatrixD; - typedef iSUnAdjointMatrix vAMatrix; + typedef iSUnAdjointMatrix vAMatrix; typedef iSUnAdjointMatrix vAMatrixF; typedef iSUnAdjointMatrix vAMatrixD; @@ -47,14 +47,9 @@ public: typedef Lattice LatticeAdjMatrixF; typedef Lattice LatticeAdjMatrixD; - typedef Lattice >, Nd> > - LatticeAdjField; - typedef Lattice >, Nd> > - LatticeAdjFieldF; - typedef Lattice >, Nd> > - LatticeAdjFieldD; - - + typedef Lattice >, Nd> > LatticeAdjField; + typedef Lattice >, Nd> > LatticeAdjFieldF; + typedef Lattice >, Nd> > LatticeAdjFieldD; template @@ -128,7 +123,9 @@ public: } // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 ) - static void projectOnAlgebra(typename SU::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) { + static void projectOnAlgebra(typename SU::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) + { + conformable(h_out, in); h_out = Zero(); AMatrix iTa; @@ -136,7 +133,7 @@ public: for (int a = 0; a < Dimension; a++) { generator(a, iTa); - auto tmp = real(trace(iTa * in)) * coefficient; + LatticeComplex tmp = real(trace(iTa * in)) * coefficient; pokeColour(h_out, tmp, a); } } From 3d27708f0757d418fbb773dafb5d760a599080b2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 31 Aug 2020 23:55:49 -0400 Subject: [PATCH 13/32] Basic where test --- tests/core/Test_where.cc | 80 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 tests/core/Test_where.cc diff --git a/tests/core/Test_where.cc b/tests/core/Test_where.cc new file mode 100644 index 00000000..050b711b --- /dev/null +++ b/tests/core/Test_where.cc @@ -0,0 +1,80 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_poisson_fft.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< latt_size ({N,4,4}); + std::vector simd_layout({vComplexD::Nsimd(),1,1}); + std::vector mpi_layout ({1,1,1}); + + int vol = 1; + int nd = latt_size.size(); + for(int d=0;d({45,12,81,9})); + gaussian(RNG,rn); + + RealD nn=norm2(rn); + for(int mu=0;mu Date: Mon, 31 Aug 2020 23:56:26 -0400 Subject: [PATCH 14/32] real and imag part not in ET --- Grid/lattice/Lattice_real_imag.h | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 Grid/lattice/Lattice_real_imag.h diff --git a/Grid/lattice/Lattice_real_imag.h b/Grid/lattice/Lattice_real_imag.h new file mode 100644 index 00000000..003300cc --- /dev/null +++ b/Grid/lattice/Lattice_real_imag.h @@ -0,0 +1,79 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/lattice/Lattice_reality.h + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle +Author: neo + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#ifndef GRID_LATTICE_REAL_IMAG_H +#define GRID_LATTICE_REAL_IMAG_H + + +// FIXME .. this is the sector of the code +// I am most worried about the directions +// The choice of burying complex in the SIMD +// is making the use of "real" and "imag" very cumbersome + +NAMESPACE_BEGIN(Grid); + +template inline Lattice real(const Lattice &lhs){ + Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + + ret.Checkerboard()=lhs.Checkerboard(); + accelerator_for( ss, lhs_v.size(), 1, { + ret_v[ss] =real(lhs_v[ss]); + }); + return ret; +}; +template inline Lattice imag(const Lattice &lhs){ + Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + + ret.Checkerboard()=lhs.Checkerboard(); + accelerator_for( ss, lhs_v.size(), 1, { + ret_v[ss] =imag(lhs_v[ss]); + }); + return ret; +}; + +template::value,void>::type * = nullptr> + auto real(const Expression &expr) -> decltype(real(closure(expr))) +{ + return real(closure(expr)); +} +template::value,void>::type * = nullptr> + auto imag(const Expression &expr) -> decltype(imag(closure(expr))) +{ + return imag(closure(expr)); +} + +NAMESPACE_END(Grid); + +#endif From 8b74174d7419600cbff0ad364657bef3b9b5bed9 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 1 Sep 2020 00:03:26 -0400 Subject: [PATCH 15/32] Eigen tensor serialisatiino happy undeer GPU. Regret agreeing to let us couple Eigen types to Grid IO --- tests/IO/Test_serialisation.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/IO/Test_serialisation.cc b/tests/IO/Test_serialisation.cc index 7f3e5bca..27fe589e 100644 --- a/tests/IO/Test_serialisation.cc +++ b/tests/IO/Test_serialisation.cc @@ -47,14 +47,14 @@ public: bool , b, std::vector, array, std::vector >, twodimarray, - std::vector > >, cmplx3darray, + std::vector> > >, cmplx3darray, SpinColourMatrix, scm ); myclass() {} myclass(int i) : array(4,5.1) , twodimarray(3,std::vector(5, 1.23456)) - , cmplx3darray(3,std::vector>(5, std::vector(7, Complex(1.2, 3.4)))) + , cmplx3darray(3,std::vector>>(5, std::vector>(7, std::complex(1.2, 3.4)))) , ve(2, myenum::blue) { e=myenum::red; @@ -121,11 +121,11 @@ namespace Eigen { // Perform I/O tests on a range of tensor types // Test coverage: scalars, complex and GridVectors in single, double and default precision class TensorIO : public Serializable { - using TestScalar = ComplexD; + using TestScalar = std::complex; using SR3 = Eigen::Sizes<9,4,2>; using SR5 = Eigen::Sizes<5,4,3,2,1>; using ESO = Eigen::StorageOptions; - using TensorRank3 = Eigen::Tensor; + using TensorRank3 = Eigen::Tensor, 3, ESO::RowMajor>; using TensorR5 = Eigen::TensorFixedSize; using TensorR5Alt = Eigen::TensorFixedSize; using Tensor942 = Eigen::TensorFixedSize; @@ -134,8 +134,8 @@ class TensorIO : public Serializable { using LSCTensor = Eigen::TensorFixedSize>; static const Real FlagR; - static const Complex Flag; - static const ComplexF FlagF; + static const std::complex Flag; + static const std::complex FlagF; static const TestScalar FlagTS; static const char * const pszFilePrefix; @@ -230,8 +230,8 @@ public: }; const Real TensorIO::FlagR {1}; -const Complex TensorIO::Flag {1,-1}; -const ComplexF TensorIO::FlagF {1,-1}; +const std::complex TensorIO::Flag {1,-1}; +const std::complex TensorIO::FlagF {1,-1}; const TensorIO::TestScalar TensorIO::FlagTS{1,-1}; const char * const TensorIO::pszFilePrefix = "tensor_"; From cbc995b74cf982cfcb1ef59991728490edff7198 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 1 Sep 2020 00:12:54 -0400 Subject: [PATCH 16/32] Made better interface --- tests/core/Test_main.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/core/Test_main.cc b/tests/core/Test_main.cc index 08752a46..af8b747b 100644 --- a/tests/core/Test_main.cc +++ b/tests/core/Test_main.cc @@ -137,7 +137,6 @@ int main(int argc, char **argv) { LatticeReal iscalar(&Fine); SpinMatrix GammaFive; - iSpinMatrix iGammaFive; ColourMatrix cmat; random(FineRNG, Foo); @@ -283,7 +282,6 @@ int main(int argc, char **argv) { cMat = mydouble * cMat; sMat = adj(sMat); // LatticeSpinMatrix adjoint - sMat = iGammaFive * sMat; // SpinMatrix * LatticeSpinMatrix sMat = GammaFive * sMat; // SpinMatrix * LatticeSpinMatrix scMat = adj(scMat); cMat = adj(cMat); From 15ca8637f357bc080a75bcd5da46e856b4fb2122 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 1 Sep 2020 00:13:32 -0400 Subject: [PATCH 17/32] No norms in HermOp --- tests/core/Test_staggered_naive.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/core/Test_staggered_naive.cc b/tests/core/Test_staggered_naive.cc index f96bac93..9fe35a54 100644 --- a/tests/core/Test_staggered_naive.cc +++ b/tests/core/Test_staggered_naive.cc @@ -261,11 +261,11 @@ int main (int argc, char ** argv) pickCheckerboard(Odd ,phi_o,phi); SchurDiagMooeeOperator HermOpEO(Ds); - HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); - HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + HermOpEO.MpcDagMpc(chi_e,dchi_e); + HermOpEO.MpcDagMpc(chi_o,dchi_o); - HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); - HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + HermOpEO.MpcDagMpc(phi_e,dphi_e); + HermOpEO.MpcDagMpc(phi_o,dphi_o); pDce = innerProduct(phi_e,dchi_e); pDco = innerProduct(phi_o,dchi_o); From d982a5b6d598af63d4c51443ef173ee6b131ab4a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 1 Sep 2020 00:14:04 -0400 Subject: [PATCH 18/32] Fix coaarsened --- tests/debug/Test_cayley_coarsen_support.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/debug/Test_cayley_coarsen_support.cc b/tests/debug/Test_cayley_coarsen_support.cc index e6e18250..e91b3070 100644 --- a/tests/debug/Test_cayley_coarsen_support.cc +++ b/tests/debug/Test_cayley_coarsen_support.cc @@ -122,7 +122,7 @@ int main (int argc, char ** argv) typedef Aggregation Subspace; Subspace Aggregates(Coarse5d,FGrid,cb); - Aggregates.CreateSubspaceRandom(RNG5); + // Aggregates.CreateSubspaceRandom(RNG5); subspace=Aggregates.subspace; @@ -163,7 +163,7 @@ int main (int argc, char ** argv) LittleDiracOp.M(c_src,c_res); std::cout< Date: Tue, 1 Sep 2020 00:14:33 -0400 Subject: [PATCH 19/32] little worry large Nbasis doesnt compile GPU --- tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc index 0b1c3a1f..d9249e0d 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc @@ -311,7 +311,7 @@ int main (int argc, char ** argv) { std::cout << GridLogMessage << "Keep " << coarse.Nk << " total vectors" << std::endl; assert(Nm2 >= Nm1); - const int nbasis= 70; + const int nbasis= 32; CoarseFineIRL IRL(FrbGrid,CoarseGrid5rb,HermOp,Odd); std::cout << GridLogMessage << "Constructed CoarseFine IRL" << std::endl; From c273fb051cdb7c739f4eba1ad6a15e203f1fcab0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 1 Sep 2020 15:27:59 -0400 Subject: [PATCH 20/32] Peek poke laattice --- Grid/lattice/Lattice_peekpoke.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h index c79becf2..5caab214 100644 --- a/Grid/lattice/Lattice_peekpoke.h +++ b/Grid/lattice/Lattice_peekpoke.h @@ -182,6 +182,14 @@ inline void peekLocalSite(sobj &s,const LatticeView &l,Coordinate &site) return; }; +template +inline void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site) +{ + autoView(lv,l,CpuRead); + peekLocalSite(s,lv,site); + return; +}; + // Must be CPU write view template inline void pokeLocalSite(const sobj &s,LatticeView &l,Coordinate &site) @@ -210,6 +218,14 @@ inline void pokeLocalSite(const sobj &s,LatticeView &l,Coordinate &site) return; }; +template +inline void pokeLocalSite(const sobj &s, Lattice &l,Coordinate &site) +{ + autoView(lv,l,CpuWrite); + pokeLocalSite(s,lv,site); + return; +}; + NAMESPACE_END(Grid); #endif From 5791021dcd07d619dde942ba3777bca4cf14378c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 1 Sep 2020 15:28:15 -0400 Subject: [PATCH 21/32] Speed up Cshift more with coalesced --- Grid/cshift/Cshift.h | 19 ++----------------- Grid/cshift/Cshift_common.h | 12 ++++++------ 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/Grid/cshift/Cshift.h b/Grid/cshift/Cshift.h index 9150579c..c7b9e3cb 100644 --- a/Grid/cshift/Cshift.h +++ b/Grid/cshift/Cshift.h @@ -52,23 +52,8 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); -template -auto Cshift(const LatticeUnaryExpression &expr,int dim,int shift) - -> Lattice -{ - return Cshift(closure(expr),dim,shift); -} -template -auto Cshift(const LatticeBinaryExpression &expr,int dim,int shift) - -> Lattice -{ - return Cshift(closure(expr),dim,shift); -} -template -auto Cshift(const LatticeTrinaryExpression &expr,int dim,int shift) - -> Lattice +template::value,void>::type * = nullptr> +auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr)) { return Cshift(closure(expr),dim,shift); } diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index d296c024..723b88be 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -76,8 +76,8 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen autoView(rhs_v , rhs, AcceleratorRead); auto buffer_p = & buffer[0]; auto table = &Cshift_table[0]; - accelerator_for(i,ent,1,{ - buffer_p[table[i].first]=rhs_v[table[i].second]; + accelerator_for(i,ent,vobj::Nsimd(),{ + coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); }); } } @@ -185,8 +185,8 @@ template void Scatter_plane_simple (Lattice &rhs,commVector void Copy_plane(Lattice& lhs,const Lattice &rhs autoView(rhs_v , rhs, AcceleratorRead); autoView(lhs_v , lhs, AcceleratorWrite); auto table = &Cshift_table[0]; - accelerator_for(i,ent,1,{ - lhs_v[table[i].first]=rhs_v[table[i].second]; + accelerator_for(i,ent,vobj::Nsimd(),{ + coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); }); } } From 8807d998bc7c12ac5c02c1e5dcf1cf1d1bdfb34a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 1 Sep 2020 15:29:11 -0400 Subject: [PATCH 22/32] closure improved --- Grid/qcd/action/gauge/GaugeImplementations.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/action/gauge/GaugeImplementations.h b/Grid/qcd/action/gauge/GaugeImplementations.h index 19bc5aa6..a14aec1b 100644 --- a/Grid/qcd/action/gauge/GaugeImplementations.h +++ b/Grid/qcd/action/gauge/GaugeImplementations.h @@ -59,7 +59,7 @@ public: } static inline GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) { - return Cshift(closure(adj(Link)), mu, -1); + return Cshift(adj(Link), mu, -1); } static inline GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) { From 1654c4f3c0ffec577833f3864aad8fedbce2debe Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 1 Sep 2020 15:29:45 -0400 Subject: [PATCH 23/32] Closure improved --- Grid/qcd/utils/WilsonLoops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index fdd53698..0367c9fa 100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -485,7 +485,7 @@ public: // Up staple ___ ___ // | | - tmp = Cshift(closure(adj(U[nu])), nu, -1); + tmp = Cshift(adj(U[nu]), nu, -1); tmp = adj(U2[mu]) * tmp; tmp = Cshift(tmp, mu, -2); @@ -519,7 +519,7 @@ public: // // | | - tmp = Cshift(closure(adj(U2[nu])), nu, -2); + tmp = Cshift(adj(U2[nu]), nu, -2); tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp); tmp = U2[nu] * Cshift(tmp, nu, 2); Stap += Cshift(tmp, mu, 1); From eac1f08b7b1412eef6be9b53d86a552fc0e427e4 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 1 Sep 2020 15:30:33 -0400 Subject: [PATCH 24/32] Close expressions passed as an argument --- Grid/qcd/utils/CovariantCshift.h | 49 ++++++++++++++------------------ 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h index 6ac69150..cee1fa12 100644 --- a/Grid/qcd/utils/CovariantCshift.h +++ b/Grid/qcd/utils/CovariantCshift.h @@ -53,23 +53,21 @@ namespace PeriodicBC { return Cshift(tmp,mu,-1);// moves towards positive mu } - template auto - CovShiftForward(const Lattice &Link, - int mu, - const LatticeUnaryExpression &expr) - -> Lattice + template::value,void>::type * = nullptr> + auto CovShiftForward(const Lattice &Link, + int mu, + const Expr &expr) -> decltype(closure(expr)) { - Lattice arg(expr); + auto arg = closure(expr); return CovShiftForward(Link,mu,arg); } - template auto - CovShiftBackward(const Lattice &Link, - int mu, - const LatticeUnaryExpression &expr) - -> Lattice + template::value,void>::type * = nullptr> + auto CovShiftBackward(const Lattice &Link, + int mu, + const Expr &expr) -> decltype(closure(expr)) { - Lattice arg(expr); - return CovShiftForward(Link,mu,arg); + auto arg = closure(expr); + return CovShiftBackward(Link,mu,arg); } } @@ -142,26 +140,23 @@ namespace ConjugateBC { return Cshift(tmp,mu,-1);// moves towards positive mu } - template auto - CovShiftForward(const Lattice &Link, - int mu, - const LatticeUnaryExpression &expr) - -> Lattice + template::value,void>::type * = nullptr> + auto CovShiftForward(const Lattice &Link, + int mu, + const Expr &expr) -> decltype(closure(expr)) { - Lattice arg(expr); + auto arg = closure(expr); return CovShiftForward(Link,mu,arg); } - template auto - CovShiftBackward(const Lattice &Link, - int mu, - const LatticeUnaryExpression &expr) - -> Lattice + template::value,void>::type * = nullptr> + auto CovShiftBackward(const Lattice &Link, + int mu, + const Expr &expr) -> decltype(closure(expr)) { - Lattice arg(expr); - return CovShiftForward(Link,mu,arg); + auto arg = closure(expr); + return CovShiftBackward(Link,mu,arg); } - } From d3ce60713d87dd1baf7d9f7acf1097da1c932d1a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Sep 2020 15:44:13 -0400 Subject: [PATCH 25/32] UVM, Device and Lattice/aligned allocators --- Grid/allocator/AlignedAllocator.h | 57 +++++++++++++++++++++++++++---- Grid/allocator/MemoryManager.h | 4 +-- 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index ebb3162b..249732fb 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -65,8 +65,7 @@ public: MemoryManager::CpuFree((void *)__p,bytes); } - // FIXME: hack for the copy constructor, eventually it must be avoided - //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; + // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop void construct(pointer __p, const _Tp& __val) { assert(0);}; void construct(pointer __p) { }; void destroy(pointer __p) { }; @@ -74,6 +73,9 @@ public: template inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } template inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } +////////////////////////////////////////////////////////////////////////////////////// +// Unified virtual memory +////////////////////////////////////////////////////////////////////////////////////// template class uvmAllocator { public: @@ -109,22 +111,63 @@ public: MemoryManager::SharedFree((void *)__p,bytes); } - // FIXME: hack for the copy constructor, eventually it must be avoided void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; - //void construct(pointer __p, const _Tp& __val) { }; void construct(pointer __p) { }; void destroy(pointer __p) { }; }; template inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; } template inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; } +//////////////////////////////////////////////////////////////////////////////// +// Device memory +//////////////////////////////////////////////////////////////////////////////// +template +class devAllocator { +public: + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef _Tp* pointer; + typedef const _Tp* const_pointer; + typedef _Tp& reference; + typedef const _Tp& const_reference; + typedef _Tp value_type; + + template struct rebind { typedef devAllocator<_Tp1> other; }; + devAllocator() throw() { } + devAllocator(const devAllocator&) throw() { } + template devAllocator(const devAllocator<_Tp1>&) throw() { } + ~devAllocator() throw() { } + pointer address(reference __x) const { return &__x; } + size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); } + + pointer allocate(size_type __n, const void* _p= 0) + { + size_type bytes = __n*sizeof(_Tp); + profilerAllocate(bytes); + _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); + assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); + return ptr; + } + + void deallocate(pointer __p, size_type __n) + { + size_type bytes = __n * sizeof(_Tp); + profilerFree(bytes); + MemoryManager::AcceleratorFree((void *)__p,bytes); + } + void construct(pointer __p, const _Tp& __val) { }; + void construct(pointer __p) { }; + void destroy(pointer __p) { }; +}; +template inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; } +template inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; } + //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -template using commAllocator = uvmAllocator; +//template using commAllocator = devAllocator; template using Vector = std::vector >; -template using commVector = std::vector >; -//template using Matrix = std::vector > >; +template using commVector = std::vector >; NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index 23065c58..aac13aee 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -93,12 +93,12 @@ private: static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ; static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ; - static void *AcceleratorAllocate(size_t bytes); - static void AcceleratorFree (void *ptr,size_t bytes); static void PrintBytes(void); public: static void Init(void); static void InitMessage(void); + static void *AcceleratorAllocate(size_t bytes); + static void AcceleratorFree (void *ptr,size_t bytes); static void *SharedAllocate(size_t bytes); static void SharedFree (void *ptr,size_t bytes); static void *CpuAllocate(size_t bytes); From 0c3095e173bc2886b121edb3fafa78646948f5e8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Sep 2020 15:45:35 -0400 Subject: [PATCH 26/32] Comms buffers to device memory --- Grid/communicator/Communicator_mpi3.cc | 88 +++++++++----------------- 1 file changed, 31 insertions(+), 57 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 6130195d..28c7b8a4 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -302,60 +302,35 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, int bytes) { std::vector reqs(0); - // unsigned long xcrc = crc32(0L, Z_NULL, 0); - // unsigned long rcrc = crc32(0L, Z_NULL, 0); - // xcrc = crc32(xcrc,(unsigned char *)xmit,bytes); - SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes); - SendToRecvFromComplete(reqs); - // rcrc = crc32(rcrc,(unsigned char *)recv,bytes); - // printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc); -} -void CartesianCommunicator::SendRecvPacket(void *xmit, - void *recv, - int sender, - int receiver, - int bytes) -{ - MPI_Status stat; - assert(sender != receiver); - int tag = sender; - if ( _processor == sender ) { - MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator); - } - if ( _processor == receiver ) { - MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat); - } -} -// Basic Halo comms primitive -void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, - void *xmit, - int dest, - void *recv, - int from, - int bytes) -{ + unsigned long xcrc = crc32(0L, Z_NULL, 0); + unsigned long rcrc = crc32(0L, Z_NULL, 0); + int myrank = _processor; int ierr; - if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { - MPI_Request xrq; - MPI_Request rrq; + // Enforce no UVM in comms, device or host OK + int uvm; + auto + cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) xmit); + assert(cuerr == cudaSuccess ); + assert(uvm==0); - ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); - ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); + cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) recv); + assert(cuerr == cudaSuccess ); + assert(uvm==0); - assert(ierr==0); - list.push_back(xrq); - list.push_back(rrq); - } else { - // Give the CPU to MPI immediately; can use threads to overlap optionally - ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, - recv,bytes,MPI_CHAR,from, from, - communicator,MPI_STATUS_IGNORE); - assert(ierr==0); - } + // Give the CPU to MPI immediately; can use threads to overlap optionally + // printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes); + ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, + recv,bytes,MPI_CHAR,from, from, + communicator,MPI_STATUS_IGNORE); + assert(ierr==0); + + // xcrc = crc32(xcrc,(unsigned char *)xmit,bytes); + // rcrc = crc32(rcrc,(unsigned char *)recv,bytes); + // printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush } - +// Basic Halo comms primitive double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, int dest, void *recv, @@ -411,15 +386,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector &waitall,int dir) -{ - SendToRecvFromComplete(waitall); -} -void CartesianCommunicator::StencilBarrier(void) -{ - MPI_Barrier (ShmComm); -} -void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) +void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector &list,int dir) { int nreq=list.size(); @@ -430,6 +397,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector & assert(ierr==0); list.resize(0); } +void CartesianCommunicator::StencilBarrier(void) +{ + MPI_Barrier (ShmComm); +} +//void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) +//{ +//} void CartesianCommunicator::Barrier(void) { int ierr = MPI_Barrier(communicator); From b4255140d611f3cbb1ccf6148450f14f5e0539cb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Sep 2020 15:47:46 -0400 Subject: [PATCH 27/32] Stale data member eliminated --- Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h | 2 +- Grid/qcd/action/fermion/WilsonFermion5D.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index 625eda63..ca660610 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -208,7 +208,7 @@ public: LebesgueOrder LebesgueEvenOdd; // Comms buffer - std::vector > comm_buf; + // std::vector > comm_buf; /////////////////////////////////////////////////////////////// // Conserved current utilities diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 804b1d10..80231bb4 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -215,7 +215,7 @@ public: LebesgueOrder LebesgueEvenOdd; // Comms buffer - std::vector > comm_buf; + // std::vector > comm_buf; }; From 85b1c5df39c33e3de9d3ab2917400b9c985428c8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Sep 2020 15:48:16 -0400 Subject: [PATCH 28/32] A never hit case that is not 100% confident is asserted for safety --- Grid/cshift/Cshift_common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index d296c024..40ab5032 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -222,6 +222,7 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA // Test_cshift_red_black code. // std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME std::cout<<" Unthreaded warning -- buffer is not densely packed ??"< Date: Thu, 3 Sep 2020 15:49:13 -0400 Subject: [PATCH 29/32] Include cuda.h --- Grid/threads/Accelerator.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 74a3ea22..29d12904 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -90,6 +90,7 @@ void acceleratorInit(void); ////////////////////////////////////////////// #ifdef GRID_CUDA +#include #ifdef __CUDA_ARCH__ #define GRID_SIMT From 8244caff2507eab0a99a59f275bd1745efa7336d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Sep 2020 18:52:55 -0400 Subject: [PATCH 30/32] Remove the asynchronous non-Stencil calls. --- benchmarks/Benchmark_comms.cc | 108 ++++------------------------------ 1 file changed, 10 insertions(+), 98 deletions(-) diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 44ccbd19..232030c8 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -74,90 +74,6 @@ int main (int argc, char ** argv) std::vector t_time(Nloop); time_statistics timestat; - std::cout< > xbuf(8); - std::vector > rbuf(8); - - int ncomm; - int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); - for(int mu=0;mu<8;mu++){ - xbuf[mu].resize(lat*lat*lat*Ls); - rbuf[mu].resize(lat*lat*lat*Ls); - // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < requests; - - ncomm=0; - for(int mu=0;mu<4;mu++){ - - if (mpi_layout[mu]>1 ) { - - ncomm++; - int comm_proc=1; - int xmit_to_rank; - int recv_from_rank; - Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFromBegin(requests, - (void *)&xbuf[mu][0], - xmit_to_rank, - (void *)&rbuf[mu][0], - recv_from_rank, - bytes); - - comm_proc = mpi_layout[mu]-1; - - Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFromBegin(requests, - (void *)&xbuf[mu+4][0], - xmit_to_rank, - (void *)&rbuf[mu+4][0], - recv_from_rank, - bytes); - - } - } - Grid.SendToRecvFromComplete(requests); - Grid.Barrier(); - double stop=usecond(); - t_time[i] = stop-start; // microseconds - } - - timestat.statistics(t_time); - - double dbytes = bytes*ppn; - double xbytes = dbytes*2.0*ncomm; - double rbytes = xbytes; - double bidibytes = xbytes+rbytes; - - std::cout< requests; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFromBegin(requests, - (void *)&xbuf[mu][0], - xmit_to_rank, - (void *)&rbuf[mu][0], - recv_from_rank, - bytes); - Grid.SendToRecvFromComplete(requests); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); } comm_proc = mpi_layout[mu]-1; { std::vector requests; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); - Grid.SendToRecvFromBegin(requests, - (void *)&xbuf[mu+4][0], - xmit_to_rank, - (void *)&rbuf[mu+4][0], - recv_from_rank, - bytes); - Grid.SendToRecvFromComplete(requests); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); } } } From a8309638d4be9faf49619a8220a9b06a4e16eb73 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Sep 2020 20:29:26 -0400 Subject: [PATCH 31/32] UVM check in MPI calls --- Grid/communicator/Communicator_mpi3.cc | 11 ++--------- Grid/threads/Accelerator.h | 22 ++++++++++++++++++++++ configure.ac | 5 ++--- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 28c7b8a4..83f71233 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -309,15 +309,8 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, int ierr; // Enforce no UVM in comms, device or host OK - int uvm; - auto - cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) xmit); - assert(cuerr == cudaSuccess ); - assert(uvm==0); - - cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) recv); - assert(cuerr == cudaSuccess ); - assert(uvm==0); + assert(acceleratorIsCommunicable(xmit)); + assert(acceleratorIsCommunicable(recv)); // Give the CPU to MPI immediately; can use threads to overlap optionally // printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 29d12904..1a3dfdc2 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -70,6 +70,7 @@ NAMESPACE_BEGIN(Grid); // // Memory management: // +// int acceleratorIsCommunicable(void *pointer); // void *acceleratorAllocShared(size_t bytes); // void acceleratorFreeShared(void *ptr); // @@ -166,6 +167,16 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} +inline int acceleratorIsCommunicable(void *ptr) +{ + int uvm; + auto + cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr); + assert(cuerr == cudaSuccess ); + if(uvm) return 0; + else return 1; +} + #endif ////////////////////////////////////////////// @@ -220,6 +231,15 @@ inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} +inline int acceleratorIsCommunicable(void *ptr) +{ +#if 0 + auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context()); + if ( uvm = cl::sycl::usm::alloc::shared ) return 1; + else return 0; +#endif + return 1; +} #endif @@ -299,6 +319,7 @@ inline void *acceleratorAllocShared(size_t bytes) return malloc(bytes); #endif }; +inline int acceleratorIsCommunicable(void *ptr){ return 1; } inline void *acceleratorAllocDevice(size_t bytes) { @@ -353,6 +374,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA spec inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} +inline int acceleratorIsCommunicable(void *ptr){ return 1; } #ifdef HAVE_MM_MALLOC_H inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; diff --git a/configure.ac b/configure.ac index f0bea6a4..5113757c 100644 --- a/configure.ac +++ b/configure.ac @@ -154,6 +154,7 @@ AC_ARG_ENABLE([accelerator], case ${ac_ACCELERATOR} in cuda) echo CUDA acceleration + LIBS="${LIBS} -lcuda" AC_DEFINE([GRID_CUDA],[1],[Use CUDA offload]);; sycl) echo SYCL acceleration @@ -323,7 +324,6 @@ case ${CXXTEST} in # CXXLD="nvcc -v -link" CXX="${CXXBASE} -x cu " CXXLD="${CXXBASE} -link" -# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr" CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" if test $ac_openmp = yes; then CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" @@ -483,8 +483,7 @@ case ${ac_SHM} in LDFLAGS_CPY=$LDFLAGS CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS" LDFLAGS="$AM_LDFLAGS $LDFLAGS" - AC_SEARCH_LIBS([shm_unlink], [rt], [], - [AC_MSG_ERROR("no library found for shm_unlink")]) + AC_SEARCH_LIBS([shm_unlink], [rt], [],[AC_MSG_ERROR("no library found for shm_unlink")]) CXXFLAGS=$CXXFLAGS_CPY LDFLAGS=$LDFLAGS_CPY ;; From 65b724bb5fbacadf4063278c86178586696fe866 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Sep 2020 21:46:43 -0400 Subject: [PATCH 32/32] 2 level hddcr --- tests/solver/Test_dwf_hdcr_2level.cc | 420 +++++++++++++++++++++++++++ 1 file changed, 420 insertions(+) create mode 100644 tests/solver/Test_dwf_hdcr_2level.cc diff --git a/tests/solver/Test_dwf_hdcr_2level.cc b/tests/solver/Test_dwf_hdcr_2level.cc new file mode 100644 index 00000000..df24c9d2 --- /dev/null +++ b/tests/solver/Test_dwf_hdcr_2level.cc @@ -0,0 +1,420 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_hdcr.cc + + Copyright (C) 2015 + +Author: Antonin Portelli +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; +template class RedBlackSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + RealD tol; + RealD shift; + int maxit; + + RedBlackSmoother(RealD _shift,RealD _tol,int _maxit,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + std::cout << " Red Black Smootheer "< CG(tol,maxit,false); + out =Zero(); + SchurRedBlackDiagMooeeSolve RBSolver(CG); + RBSolver(SmootherMatrix,in,out); + std::cout << " Red Black Smootheer "< +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother1; + FineSmoother & _Smoother2; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + const int nbasis= 32; + + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("./ckpoint_lat"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,100,0.0);// 18s + // rAggregates.CreateSubspaceChebyshev(RNG5,rHermDefOp,nb,60.0,0.05,500,200,150,0.0);// 15.7 23iter + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,150,0.0);// + // pad out the rAggregates. + + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,500,150,0.0);// 19s + + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,200,0.0); 15.2s + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,500,200,0.0); 16.3s + + for(int n=0;n Level1Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + + std::cout< IRLHermOp(LDOp); + Chebyshev IRLCheby(0.002,12.,151); + FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); + PlainHermOp IRLOp (IRLHermOp); + int Nk=48; + int Nm=64; + int Nstop=48; + int Nconv; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + CoarseVector c_src(Coarse5d); + gaussian(CRNG,c_src); + IRL.calc(eval,evec,c_src,Nconv); + + // ConjugateGradient CoarseCG(0.01,1000); + + ConjugateGradient CoarseCG(0.02,1000);// 14.7s + DeflatedGuesser DeflCoarseGuesser(evec,eval); + NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); + + c_src=1.0; + std::cout< PM; PM(HermDefOp,src); + std::cout< PosdefLdop(LDOp); + PowerMethod cPM; cPM(PosdefLdop,c_src); + + std::cout< , NormalEquations > TwoLevelMG; + + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + // ChebyshevSmoother FineSmoother(0.5,60.0,14,HermIndefOp,Ddwf); // 72 iter 63s + // ChebyshevSmoother FineSmoother(0.1,60.0,20,HermIndefOp,Ddwf); // 66 iter 69s + // ChebyshevSmoother FineSmoother(0.5,60.0,20,HermIndefOp,Ddwf); // 63 iter 65 s + // ChebyshevSmoother FineSmoother(1.0,60.0,20,HermIndefOp,Ddwf); // 69, 70 + // ChebyshevSmoother FineSmoother(1.0,60.0,14,HermIndefOp,Ddwf); // 77 + + // ChebyshevSmoother FineSmoother(0.5,60.0,10,HermIndefOp,Ddwf); // 23 iter 15.9s + // ChebyshevSmoother FineSmoother(0.5,60.0,14,HermIndefOp,Ddwf); // 20, 16.9s + ChebyshevSmoother FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf); // 21, 15.6s + + // MirsSmoother FineCGSmoother(0.05,0.01,20,HermIndefOp,Ddwf); + // RedBlackSmoother FineRBSmoother(0.00,0.001,100,Ddwf); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + ZeroGuesser CoarseZeroGuesser; + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + DeflCoarseGuesser, + DeflCoarseCGNE); + TwoLevelPrecon.Level(1); + + // Apply the fine-coarse-coarsecoarse 2 deep MG preconditioner in an outer PGCR on the fine fgrid + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,1000,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + + std::cout< FineCG(1.0e-8,10000); + SchurDiagMooeeOperator FineDiagMooee(Ddwf); // M_ee - Meo Moo^-1 Moe + LatticeFermion f_src_e(FrbGrid); f_src_e=1.0; + LatticeFermion f_res_e(FrbGrid); f_res_e=Zero(); + FineCG(FineDiagMooee,f_src_e,f_res_e); + + std::cout<