
GPU reductions, first cut: uses Thrust and is non-reproducible; an inclusive scan can fix this if desired.

Local reduction to a LatticeComplex, followed by a further reduction.
Peter Boyle 2019-01-01 13:53:37 +00:00
parent 3eae9a9e3f
commit 715babeac8
4 changed files with 82 additions and 171 deletions
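For reference: a tree reduction on the GPU reassociates floating-point additions, so the low bits of the sum differ from a fixed-order sum; that is the non-reproducibility the commit message refers to. A minimal sketch of the scan-based alternative it alludes to, using thrust::inclusive_scan and taking the last partial sum; the prefix sums are then well defined, though bitwise run-to-run determinism still depends on the scan implementation:

// Sketch only: sum via inclusive scan; the final element of the
// prefix-sum array is the total.
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <iostream>

int main(void) {
  thrust::device_vector<double> data(1024, 0.1);   // toy input
  thrust::device_vector<double> psum(data.size());
  thrust::inclusive_scan(data.begin(), data.end(), psum.begin());
  double sum = psum.back();                        // total = last prefix sum
  std::cout << "scan-based sum = " << sum << std::endl;
  return 0;
}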


@@ -44,42 +44,42 @@ NAMESPACE_BEGIN(Grid);
//
template<class lobj,class robj> class veq {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) == (rhs);
}
};
template<class lobj,class robj> class vne {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) != (rhs);
}
};
template<class lobj,class robj> class vlt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) < (rhs);
}
};
template<class lobj,class robj> class vle {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) <= (rhs);
}
};
template<class lobj,class robj> class vgt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) > (rhs);
}
};
template<class lobj,class robj> class vge {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) >= (rhs);
}
@@ -88,42 +88,42 @@ NAMESPACE_BEGIN(Grid);
// Generic list of functors
template<class lobj,class robj> class seq {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) == (rhs);
}
};
template<class lobj,class robj> class sne {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) != (rhs);
}
};
template<class lobj,class robj> class slt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) < (rhs);
}
};
template<class lobj,class robj> class sle {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) <= (rhs);
}
};
template<class lobj,class robj> class sgt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) > (rhs);
}
};
template<class lobj,class robj> class sge {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) >= (rhs);
}
@@ -133,7 +133,7 @@ NAMESPACE_BEGIN(Grid);
// Integer and real get extra relational functions.
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
{
typedef typename vsimd::scalar_type scalar;
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
@@ -150,7 +150,7 @@ NAMESPACE_BEGIN(Grid);
}
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
{
typedef typename vsimd::scalar_type scalar;
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
@@ -165,7 +165,7 @@ NAMESPACE_BEGIN(Grid);
}
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
{
typedef typename vsimd::scalar_type scalar;
ExtractBuffer<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
@@ -181,30 +181,30 @@ NAMESPACE_BEGIN(Grid);
#define DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
{ \
return lhs._internal op rhs; \
} \
template<class vsimd>\
inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
{ \
return lhs op rhs._internal; \
} \
@@ -212,7 +212,7 @@ NAMESPACE_BEGIN(Grid);
#define DECLARE_RELATIONAL(op,functor) \
DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \
return lhs._internal op rhs._internal; \
}
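Every functor and operator in this file gains an accelerator / accelerator_inline marker so it is callable from device code. A plausible sketch of such markers, assuming an nvcc build guarded by GRID_NVCC (Grid's actual definitions may differ in detail):

// Hypothetical host/device annotation macros in the spirit of the change:
// under nvcc a function must be __host__ __device__ to be callable from a
// kernel; on a plain CPU build the markers collapse to nothing / inline.
#ifdef GRID_NVCC
#define accelerator        __host__ __device__
#define accelerator_inline __host__ __device__ inline
#else
#define accelerator
#define accelerator_inline inline
#endif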


@@ -1,138 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_overload.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_OVERLOAD_H
#define GRID_LATTICE_OVERLOAD_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////////
// unary negation
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline Lattice<vobj> operator -(const Lattice<vobj> &r)
{
Lattice<vobj> ret(r._grid);
parallel_for(int ss=0;ss<r._grid->oSites();ss++){
vstream(ret._odata[ss], -r._odata[ss]);
}
return ret;
}
/////////////////////////////////////////////////////////////////////////////////////
// Lattice BinOp Lattice,
//NB mult performs conformable check. Do not reapply here for performance.
/////////////////////////////////////////////////////////////////////////////////////
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
mult(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]+rhs._odata[0])> ret(rhs._grid);
add(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]-rhs._odata[0])> ret(rhs._grid);
sub(ret,lhs,rhs);
return ret;
}
// Scalar BinOp Lattice ;generate return type
template<class left,class right>
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs*rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
{
Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs+rhs._odata[0]) tmp =lhs+rhs._odata[ss];
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs+rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
{
Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
vstream(ret._odata[ss],tmp);
}
return ret;
}
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]*rhs;
}
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
{
Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]+rhs;
}
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
{
Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]-rhs;
}
return ret;
}
}
#endif
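The overloads above store each site result with vstream rather than plain assignment. A semantics-only sketch of vstream, assuming it is a (possibly non-temporal) store of a computed value into the destination site; the real implementation may use streaming intrinsics to avoid polluting the cache:

// Assumption: vstream(out, in) behaves as a store of in to out; streaming
// (write-combining) stores are an optimization, not a semantic change.
template<class vobj>
inline void vstream(vobj &out, const vobj &in) {
  out = in;
}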


@@ -22,6 +22,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#pragma once
#include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_NVCC
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#endif
NAMESPACE_BEGIN(Grid);
@@ -33,6 +41,25 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
return real(nrm);
}
#ifdef GRID_NVCC
//#warning "ThrustReduce compiled"
//#include <thrust/execution_policy.h>
template<class vobj>
vobj ThrustNorm(const Lattice<vobj> &lat)
{
typedef typename vobj::scalar_type scalar_type;
auto lat_v=lat.View();
Integer s0=0;
Integer sN=lat_v.end();
scalar_type sum = 0;
scalar_type * begin = (scalar_type *)&lat_v[s0];
scalar_type * end = (scalar_type *)&lat_v[sN];
sum = thrust::reduce(begin,end,sum);
std::cout <<" thrust::reduce sum "<< sum << std::endl;
return sum;
}
#endif
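ThrustNorm flattens the lattice view into a contiguous array of scalar_type and hands [begin, end) to thrust::reduce. A standalone illustration of the same pattern, with a device_vector standing in for the lattice view (illustrative names, not Grid code):

// Reduce a flat array of scalars on the device. Note thrust::reduce RETURNS
// the sum; the init argument is not updated in place, which is why the call
// above must assign its result (sum = thrust::reduce(...)).
#include <thrust/device_vector.h>
#include <thrust/reduce.h>

double flat_sum(const thrust::device_vector<double> &v) {
  return thrust::reduce(v.begin(), v.end(), 0.0);
}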
// Double inner product
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
@@ -47,7 +74,26 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
auto left_v = left.View();
auto right_v=right.View();
#ifdef GRID_NVCC
//#if 0
typedef decltype(TensorRemove(innerProduct(left_v[0],right_v[0]))) inner_t;
Lattice<inner_t> inner_tmp(grid);
/////////////////////////
// localInnerProduct
/////////////////////////
auto inner_tmp_v = inner_tmp.View();
accelerator_loop(ss,left_v,{
inner_tmp_v[ss] = TensorRemove(innerProduct(left_v[ss],right_v[ss]));
});
/////////////////////////
// and site sum the scalars
/////////////////////////
inner_t vnrm = ThrustNorm(inner_tmp);
auto vvnrm = vnrm;
#else
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
int mywork, myoff;
GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff);
@@ -63,6 +109,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
for(int i=0;i<grid->SumArraySize();i++){
vvnrm = vvnrm+sumarray[i];
}
#endif
nrm = Reduce(vvnrm);// sum across simd
right.Grid()->GlobalSum(nrm);
return nrm;
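In outline, the GPU branch above performs a staged reduction (sketch of the stages, not verbatim Grid code):

// 1. site-local:  inner_tmp[ss] = TensorRemove(innerProduct(left[ss],right[ss]))
// 2. device sum:  vnrm = ThrustNorm(inner_tmp)  // thrust::reduce over scalars
// 3. SIMD fold:   nrm  = Reduce(vvnrm)          // sum across vector lanes
// 4. global sum:  grid->GlobalSum(nrm)          // all-reduce across MPI ranks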
@@ -102,7 +149,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),
{
int nwork, mywork, myoff;
GridThread::GetWork(x.Grid()->oSites(),thr,mywork,myoff);
nwork = x.Grid()->oSites();
GridThread::GetWork(nwork,thr,mywork,myoff);
// private to thread; sub summation
decltype(innerProductD(z_v[0],z_v[0])) vnrm=Zero();
@@ -162,7 +210,8 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
auto arg_v=arg.View();
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
nwork = grid->oSites();
GridThread::GetWork(nwork,thr,mywork,myoff);
vobj vvsum=Zero();
for(int ss=myoff;ss<mywork+myoff; ss++){
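Both hunks above route the site count through an explicit nwork variable before calling GridThread::GetWork, which hands each thread a contiguous [myoff, myoff+mywork) chunk of the nwork items. A sketch of such a partitioner, assuming an even split with the remainder given to the low thread ids (Grid's exact policy may differ):

// Hypothetical even-split partitioner in the spirit of GridThread::GetWork:
// thread thr of nthread receives mywork items starting at myoff.
static void GetWorkSketch(int nwork, int thr, int nthread,
                          int &mywork, int &myoff) {
  int chunk = nwork / nthread;
  int rem   = nwork % nthread;           // first rem threads get one extra
  mywork = chunk + (thr < rem ? 1 : 0);
  myoff  = thr * chunk + (thr < rem ? thr : rem);
}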
@@ -576,9 +625,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@@ -629,9 +678,9 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl=1;
// int nl=1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@@ -685,9 +734,9 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"


@@ -640,7 +640,7 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
in_grid->iCoorFromIindex(in_icoor[lane], lane);
}
parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
thread_loop( (int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++),{ //loop over outer index
//Assemble vector of pointers to output elements
std::vector<sobj*> out_ptrs(in_nsimd);
@@ -661,7 +661,7 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
//Unpack into those ptrs
const vobj & in_vobj = in._odata[in_oidx];
extract1(in_vobj, out_ptrs, 0);
}
});
}
//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
@@ -733,7 +733,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
grid->iCoorFromIindex(icoor[lane],lane);
}
parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
thread_loop( (uint64_t oidx = 0; oidx < grid->oSites(); oidx++),{ //loop over outer index
//Assemble vector of pointers to output elements
std::vector<sobj*> ptrs(nsimd);
@@ -757,7 +757,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
vobj vecobj;
merge1(vecobj, ptrs, 0);
out._odata[oidx] = vecobj;
}
});
}
//Convert a Lattice from one precision to another
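The recurring edit in this file replaces parallel_for(...) { ... } with thread_loop( (...), { ... });. A plausible sketch of such a macro, assuming an OpenMP host build; the parenthesised loop header protects its contents from macro-argument splitting, and the body arrives as __VA_ARGS__ (an assumption about the macro's shape, not Grid's verbatim definition):

// Hypothetical thread_loop macro; usage:
//   thread_loop( (int ss=0; ss<N; ss++), { out[ss] = in[ss]; });
#define thread_loop(range, ...) \
  _Pragma("omp parallel for")   \
  for range { __VA_ARGS__ };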