From 715babeac8ea251bf5764bb2027a87148d6b7649 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 1 Jan 2019 13:53:37 +0000
Subject: [PATCH] GPU reductions first cut; use thrust, non-reproducible.
 Inclusive scan can fix this if desired. Local reduction to LatticeComplex
 and then further reduction.

---
 Grid/lattice/Lattice_comparison_utils.h |  42 ++++----
 Grid/lattice/Lattice_overload.h         | 138 ------------------------
 Grid/lattice/Lattice_reduction.h        |  65 +++++++++--
 Grid/lattice/Lattice_transfer.h         |   8 +-
 4 files changed, 82 insertions(+), 171 deletions(-)
 delete mode 100644 Grid/lattice/Lattice_overload.h

diff --git a/Grid/lattice/Lattice_comparison_utils.h b/Grid/lattice/Lattice_comparison_utils.h
index f7091977..431aa9e1 100644
--- a/Grid/lattice/Lattice_comparison_utils.h
+++ b/Grid/lattice/Lattice_comparison_utils.h
@@ -44,42 +44,42 @@ NAMESPACE_BEGIN(Grid);
 //
 template<class lobj,class robj> class veq {
 public:
-  vInteger operator()(const lobj &lhs, const robj &rhs)
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) == (rhs);
   }
 };
 template<class lobj,class robj> class vne {
 public:
-  vInteger operator()(const lobj &lhs, const robj &rhs)
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) != (rhs);
   }
 };
 template<class lobj,class robj> class vlt {
 public:
-  vInteger operator()(const lobj &lhs, const robj &rhs)
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) < (rhs);
   }
 };
 template<class lobj,class robj> class vle {
 public:
-  vInteger operator()(const lobj &lhs, const robj &rhs)
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) <= (rhs);
   }
 };
 template<class lobj,class robj> class vgt {
 public:
-  vInteger operator()(const lobj &lhs, const robj &rhs)
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) > (rhs);
   }
 };
 template<class lobj,class robj> class vge {
 public:
-  vInteger operator()(const lobj &lhs, const robj &rhs)
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) >= (rhs);
   }
@@ -88,42 +88,42 @@ NAMESPACE_BEGIN(Grid);
 // Generic list of functors
 template<class lobj,class robj> class seq {
 public:
-  Integer operator()(const lobj &lhs, const robj &rhs)
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) == (rhs);
   }
 };
 template<class lobj,class robj> class sne {
 public:
-  Integer operator()(const lobj &lhs, const robj &rhs)
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) != (rhs);
   }
 };
 template<class lobj,class robj> class slt {
 public:
-  Integer operator()(const lobj &lhs, const robj &rhs)
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) < (rhs);
   }
 };
 template<class lobj,class robj> class sle {
 public:
-  Integer operator()(const lobj &lhs, const robj &rhs)
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) <= (rhs);
   }
 };
 template<class lobj,class robj> class sgt {
 public:
-  Integer operator()(const lobj &lhs, const robj &rhs)
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) > (rhs);
   }
 };
 template<class lobj,class robj> class sge {
 public:
-  Integer operator()(const lobj &lhs, const robj &rhs)
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) >= (rhs);
   }
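The change in the two hunks above is mechanical: every comparison functor's operator() gains the accelerator attribute, so the same functor can be instantiated inside device kernels as well as host code. For orientation, a plausible definition of these markers under a CUDA build (an assumption about Grid's threading macros, not taken from this patch):

    // Hedged sketch, not Grid's verbatim definition: under nvcc the marker
    // makes a function callable from both host and device code.
    #ifdef GRID_NVCC
    #define accelerator        __host__ __device__
    #define accelerator_inline __host__ __device__ inline
    #else
    #define accelerator
    #define accelerator_inline inline
    #endif
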
@@ -133,7 +133,7 @@ NAMESPACE_BEGIN(Grid);
 // Integer and real get extra relational functions.
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class sfunctor, class vsimd, IfSimd<vsimd> = 0>
-inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
+accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
 {
   typedef typename vsimd::scalar_type scalar;
   ExtractBuffer<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
@@ -150,7 +150,7 @@ NAMESPACE_BEGIN(Grid);
 }
 
 template<class sfunctor, class vsimd, IfSimd<vsimd> = 0>
-inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
+accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
 {
   typedef typename vsimd::scalar_type scalar;
   ExtractBuffer<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
@@ -165,7 +165,7 @@ NAMESPACE_BEGIN(Grid);
 }
 
 template<class sfunctor, class vsimd, IfSimd<vsimd> = 0>
-inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
+accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
 {
   typedef typename vsimd::scalar_type scalar;
   ExtractBuffer<scalar> vrhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
@@ -181,30 +181,30 @@ NAMESPACE_BEGIN(Grid);
 
 #define DECLARE_RELATIONAL_EQ(op,functor) \
 template<class vsimd,IfSimd<vsimd> = 0>\
-inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
+accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
 {\
   typedef typename vsimd::scalar_type scalar;\
   return Comparison(functor<vsimd,vsimd>(),lhs,rhs);\
 }\
 template<class vsimd,IfSimd<vsimd> = 0>\
-inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
+accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
 {\
   typedef typename vsimd::scalar_type scalar;\
   return Comparison(functor<vsimd,scalar>(),lhs,rhs);\
 }\
 template<class vsimd,IfSimd<vsimd> = 0>\
-inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
+accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
 {\
   typedef typename vsimd::scalar_type scalar;\
   return Comparison(functor<scalar,vsimd>(),lhs,rhs);\
 }\
 template<class vsimd>\
-inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
+accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
 { \
   return lhs._internal op rhs; \
 } \
 template<class vsimd>\
-inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
+accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
 { \
   return lhs op rhs._internal; \
 } \
@@ -212,7 +212,7 @@ NAMESPACE_BEGIN(Grid);
 #define DECLARE_RELATIONAL(op,functor) \
 DECLARE_RELATIONAL_EQ(op,functor) \
 template<class vsimd>\
-inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
+accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
 { \
   return lhs._internal op rhs._internal; \
 }
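For context, these macros are expanded lower in the same header once per relational operator (e.g. DECLARE_RELATIONAL(<,slt)); with the accelerator_inline markers above, lane-wise SIMD comparisons become usable from device code too. An illustrative usage sketch (assuming the operator instantiations elsewhere in this header; not part of this hunk):

    // Illustrative only: a SIMD comparison yields a per-lane integer
    // predicate (0 or 1 in each lane), usable e.g. with predicated selection.
    vInteger lanes_less(const vRealD &a, const vRealD &b)
    {
      return a < b;   // lane-wise compare via the slt/vlt functor machinery
    }
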
diff --git a/Grid/lattice/Lattice_overload.h b/Grid/lattice/Lattice_overload.h
deleted file mode 100644
index 0906b610..00000000
--- a/Grid/lattice/Lattice_overload.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/lattice/Lattice_overload.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#ifndef GRID_LATTICE_OVERLOAD_H
-#define GRID_LATTICE_OVERLOAD_H
-
-namespace Grid {
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // unary negation
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class vobj>
-  inline Lattice<vobj> operator -(const Lattice<vobj> &r)
-  {
-    Lattice<vobj> ret(r._grid);
-    parallel_for(int ss=0;ss<r._grid->oSites();ss++){
-      vstream(ret._odata[ss], -r._odata[ss]);
-    }
-    return ret;
-  }
-  /////////////////////////////////////////////////////////////////////////////////////
-  // Lattice BinOp Lattice,
-  //NB mult performs conformable check. Do not reapply here for performance.
-  /////////////////////////////////////////////////////////////////////////////////////
-  template<class left,class right>
-  inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
-  {
-    Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
-    mult(ret,lhs,rhs);
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
-  {
-    Lattice<decltype(lhs._odata[0]+rhs._odata[0])> ret(rhs._grid);
-    add(ret,lhs,rhs);
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
-  {
-    Lattice<decltype(lhs._odata[0]-rhs._odata[0])> ret(rhs._grid);
-    sub(ret,lhs,rhs);
-    return ret;
-  }
-
-  // Scalar BinOp Lattice ;generate return type
-  template<class left,class right>
-  inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
-  {
-    Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
-    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
-      decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];
-      vstream(ret._odata[ss],tmp);
-      //      ret._odata[ss]=lhs*rhs._odata[ss];
-    }
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
-  {
-    Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
-    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
-      decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss];
-      vstream(ret._odata[ss],tmp);
-      //      ret._odata[ss]=lhs+rhs._odata[ss];
-    }
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
-  {
-    Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
-    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
-      decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
-      vstream(ret._odata[ss],tmp);
-    }
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
-  {
-    Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
-    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
-      decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
-      vstream(ret._odata[ss],tmp);
-      //      ret._odata[ss]=lhs._odata[ss]*rhs;
-    }
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
-  {
-    Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
-    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
-      decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
-      vstream(ret._odata[ss],tmp);
-      //      ret._odata[ss]=lhs._odata[ss]+rhs;
-    }
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
-  {
-    Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
-    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
-      decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
-      vstream(ret._odata[ss],tmp);
-      //      ret._odata[ss]=lhs._odata[ss]-rhs;
-    }
-    return ret;
-  }
-}
-#endif
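The deleted header removed the last parallel_for/_odata-style operator overloads. A minimal sketch of how the same unary negation can be expressed in the View()/accelerator_loop idiom this patch uses elsewhere (illustrative only; the actual replacements live in Grid's arithmetic and expression-template headers, not in this diff):

    // Hedged sketch: unary minus in the View()/accelerator_loop style used
    // by the innerProduct hunk below; not the literal replacement code.
    template<class vobj>
    inline Lattice<vobj> operator -(const Lattice<vobj> &r)
    {
      Lattice<vobj> ret(r.Grid());
      auto r_v   = r.View();
      auto ret_v = ret.View();
      accelerator_loop(ss, r_v, {
        ret_v[ss] = -r_v[ss];   // site-wise negation, host or device
      });
      return ret;
    }
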
diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 3f0b70d9..f8b63378 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -22,6 +22,14 @@ Author: paboyle
 #pragma once
 
 #include <Grid/Grid_Eigen_Dense.h>
+#ifdef GRID_NVCC
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h>
+#include <thrust/inner_product.h>
+#endif
 
 NAMESPACE_BEGIN(Grid);
 
@@ -33,6 +41,25 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
   return real(nrm);
 }
 
+#ifdef GRID_NVCC
+//#warning "ThrustReduce compiled"
+//#include
+template<class vobj>
+vobj ThrustNorm(const Lattice<vobj> &lat)
+{
+  typedef typename vobj::scalar_type scalar_type;
+  auto lat_v = lat.View();
+  Integer s0 = 0;
+  Integer sN = lat_v.end();
+  scalar_type sum = 0;
+  scalar_type * begin = (scalar_type *)&lat_v[s0];
+  scalar_type * end   = (scalar_type *)&lat_v[sN];
+  sum = thrust::reduce(begin,end,sum); // thrust::reduce returns the result; it must be assigned
+  std::cout <<" thrust::reduce sum "<< sum << std::endl;
+  return sum;
+}
+#endif
+
 // Double inner product
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
@@ -47,7 +74,26 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
 
   auto left_v = left.View();
   auto right_v=right.View();
+#ifdef GRID_NVCC
+  //#if 0
+  typedef decltype(TensorRemove(innerProduct(left_v[0],right_v[0]))) inner_t;
+
+  Lattice<inner_t> inner_tmp(grid);
+
+  /////////////////////////
+  // localInnerProduct
+  /////////////////////////
+  auto inner_tmp_v = inner_tmp.View();
+  accelerator_loop(ss,left_v,{
+      inner_tmp_v[ss] = TensorRemove(innerProduct(left_v[ss],right_v[ss]));
+  });
+  /////////////////////////
+  // and site sum the scalars
+  /////////////////////////
+  inner_t vnrm = ThrustNorm(inner_tmp);
+  auto vvnrm = vnrm;
+#else
   thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
     int mywork, myoff;
     GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff);
@@ -63,6 +109,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
   for(int i=0;i<grid->SumArraySize();i++){
     vvnrm = vvnrm+sumarray[i];
   }
+#endif
   nrm = Reduce(vvnrm);// sum across simd
   right.Grid()->GlobalSum(nrm);
   return nrm;
@@ -102,7 +149,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
   thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),
   {
     int nwork, mywork, myoff;
-    GridThread::GetWork(x.Grid()->oSites(),thr,mywork,myoff);
+    nwork = x.Grid()->oSites();
+    GridThread::GetWork(nwork,thr,mywork,myoff);
 
     // private to thread; sub summation
     decltype(innerProductD(z_v[0],z_v[0])) vnrm=Zero();
@@ -162,7 +210,8 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
   auto arg_v=arg.View();
   thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
     int nwork, mywork, myoff;
-    GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
+    nwork = grid->oSites();
+    GridThread::GetWork(nwork,thr,mywork,myoff);
 
     vobj vvsum=Zero();
     for(int ss=myoff;ss<mywork+myoff; ss++){
@@ -...,9 +...,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
   //  Lattice<vobj> Rslice(SliceGrid);
 
   assert( FullGrid->_simd_layout[Orthog]==1);
-  int nh = FullGrid->_ndimension;
+  //  int nh = FullGrid->_ndimension;
   //  int nl = SliceGrid->_ndimension;
-  int nl = nh-1;
+  //  int nl = nh-1;
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
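The GPU path added above first does a site-local inner product into a Lattice of scalars and then site-sums those scalars with Thrust, matching the commit message. Note that thrust::reduce returns its result rather than writing through its init argument, so the assignment in ThrustNorm is essential. A self-contained sketch of the same reduction idiom with explicit device pointers (the function name and arguments are illustrative, not Grid API; assumes the data lives in device or managed memory):

    // Hedged sketch of the Thrust reduction idiom used by ThrustNorm.
    #include <thrust/device_ptr.h>
    #include <thrust/reduce.h>
    #include <thrust/functional.h>

    double site_sum(const double *data, size_t n)
    {
      thrust::device_ptr<const double> first(data);
      thrust::device_ptr<const double> last (data + n);
      // device_ptr tags the iterators so reduce dispatches to the GPU backend.
      return thrust::reduce(first, last, 0.0, thrust::plus<double>());
    }

As the subject line notes, this reduction is non-reproducible run to run because the summation order is not fixed; an inclusive scan with a deterministic order could fix that if desired.
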
@@ -629,9 +678,9 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
   //  Lattice<vobj> Rslice(SliceGrid);
 
   assert( FullGrid->_simd_layout[Orthog]==1);
-  int nh = FullGrid->_ndimension;
+  //  int nh = FullGrid->_ndimension;
   //  int nl = SliceGrid->_ndimension;
-  int nl=1;
+  //  int nl=1;
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
@@ -685,9 +734,9 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 
   assert( FullGrid->_simd_layout[Orthog]==1);
-  int nh = FullGrid->_ndimension;
+  //  int nh = FullGrid->_ndimension;
   //  int nl = SliceGrid->_ndimension;
-  int nl = nh-1;
+  //  int nl = nh-1;
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index 4f46fb60..789fafbf 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -640,7 +640,7 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
     in_grid->iCoorFromIindex(in_icoor[lane], lane);
   }
 
-  parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
+  thread_loop( (int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++),{ //loop over outer index
     //Assemble vector of pointers to output elements
     std::vector<sobj*> out_ptrs(in_nsimd);
 
@@ -661,7 +661,7 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
     //Unpack into those ptrs
     const vobj & in_vobj = in._odata[in_oidx];
     extract1(in_vobj, out_ptrs, 0);
-  }
+  });
 }
 
 //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
@@ -733,7 +733,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
     grid->iCoorFromIindex(icoor[lane],lane);
   }
 
-  parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
+  thread_loop( (uint64_t oidx = 0; oidx < grid->oSites(); oidx++),{ //loop over outer index
     //Assemble vector of pointers to output elements
     std::vector<sobj*> ptrs(nsimd);
 
@@ -757,7 +757,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
     vobj vecobj;
     merge1(vecobj, ptrs, 0);
     out._odata[oidx] = vecobj;
-  }
+  });
 }
 
 //Convert a Lattice from one precision to another
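The Lattice_transfer.h hunks are a mechanical conversion from the parallel_for macro to thread_loop; both remain host-side threaded loops. The conversion pattern, extracted from the hunks above (N, in and out are placeholders):

    // Old form, removed above:
    parallel_for(int ss=0; ss<N; ss++){
      out[ss] = in[ss];
    }
    // New form: the loop header is parenthesised and passed, together with
    // the body, as macro arguments; note the closing ");".
    thread_loop( (int ss=0; ss<N; ss++),{
      out[ss] = in[ss];
    });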