From ba5dc670a53d3cb0cf140d905bc5b8e97512880c Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Tue, 22 Dec 2020 10:10:07 -0500 Subject: [PATCH] Reimplemented GparityWilsonImpl::InsertForce5D to run efficiently on GPUs Swapped order of templated tensor code and c-number specializations in Tensor_outer.h to fix compile issue with type deduction on Summit --- Grid/qcd/action/fermion/GparityWilsonImpl.h | 50 +++++++++++++-------- Grid/tensors/Tensor_outer.h | 21 ++++----- 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h index dd5801a9..f8ae7a9f 100644 --- a/Grid/qcd/action/fermion/GparityWilsonImpl.h +++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h @@ -30,7 +30,6 @@ directory NAMESPACE_BEGIN(Grid); - /* Policy implementation for G-parity boundary conditions @@ -358,28 +357,41 @@ public: inline void extractLinkField(std::vector &mat, DoubledGaugeField &Uds){ assert(0); } - + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) { - - int Ls = Btilde.Grid()->_fdimensions[0]; - - GaugeLinkField tmp(mat.Grid()); - tmp = Zero(); + int Ls=Btilde.Grid()->_fdimensions[0]; + { - autoView( tmp_v , tmp, CpuWrite); - autoView( Atilde_v , Atilde, CpuRead); - autoView( Btilde_v , Btilde, CpuRead); - thread_for(ss,tmp.Grid()->oSites(),{ - for (int s = 0; s < Ls; s++) { - int sF = s + Ls * ss; - auto ttmp = traceIndex(outerProduct(Btilde_v[sF], Atilde_v[sF])); - tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); - } - }); + autoView( mat_v , mat, AcceleratorWrite); + autoView( Btilde_v , Btilde, AcceleratorRead); + autoView( Atilde_v , Atilde, AcceleratorRead); + accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{ + int sU=sss; + typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType; + ColorMatrixType sum; + zeroit(sum); + for(int s=0;s(mat, tmp, mu); - return; } + + + + }; diff --git 
a/Grid/tensors/Tensor_outer.h b/Grid/tensors/Tensor_outer.h index 4902c22f..0fad84b1 100644 --- a/Grid/tensors/Tensor_outer.h +++ b/Grid/tensors/Tensor_outer.h @@ -35,6 +35,17 @@ NAMESPACE_BEGIN(Grid); // Vector x Vector -> Matrix /////////////////////////////////////////////////////////////////////////////////////// +template = 0> +accelerator_inline CC outerProduct(const CC &l, const CC& r) +{ + return l*conj(r); +} +template = 0> +accelerator_inline RR outerProduct(const RR &l, const RR& r) +{ + return l*r; +} + template accelerator_inline auto outerProduct (const iVector& lhs,const iVector& rhs) -> iMatrix { @@ -57,16 +68,6 @@ auto outerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar = 0> -accelerator_inline CC outerProduct(const CC &l, const CC& r) -{ - return l*conj(r); -} -template = 0> -accelerator_inline RR outerProduct(const RR &l, const RR& r) -{ - return l*r; -} NAMESPACE_END(Grid);