mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
GPU reductions first cut; use thrust, non-reproducible. Inclusive scan can fix this if desired.
Local reduction to LatticeComplex and then further reduction.
This commit is contained in:
parent
3eae9a9e3f
commit
715babeac8
@ -44,42 +44,42 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
//
|
//
|
||||||
template<class lobj,class robj> class veq {
|
template<class lobj,class robj> class veq {
|
||||||
public:
|
public:
|
||||||
vInteger operator()(const lobj &lhs, const robj &rhs)
|
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
|
||||||
{
|
{
|
||||||
return (lhs) == (rhs);
|
return (lhs) == (rhs);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class lobj,class robj> class vne {
|
template<class lobj,class robj> class vne {
|
||||||
public:
|
public:
|
||||||
vInteger operator()(const lobj &lhs, const robj &rhs)
|
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
|
||||||
{
|
{
|
||||||
return (lhs) != (rhs);
|
return (lhs) != (rhs);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class lobj,class robj> class vlt {
|
template<class lobj,class robj> class vlt {
|
||||||
public:
|
public:
|
||||||
vInteger operator()(const lobj &lhs, const robj &rhs)
|
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
|
||||||
{
|
{
|
||||||
return (lhs) < (rhs);
|
return (lhs) < (rhs);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class lobj,class robj> class vle {
|
template<class lobj,class robj> class vle {
|
||||||
public:
|
public:
|
||||||
vInteger operator()(const lobj &lhs, const robj &rhs)
|
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
|
||||||
{
|
{
|
||||||
return (lhs) <= (rhs);
|
return (lhs) <= (rhs);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class lobj,class robj> class vgt {
|
template<class lobj,class robj> class vgt {
|
||||||
public:
|
public:
|
||||||
vInteger operator()(const lobj &lhs, const robj &rhs)
|
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
|
||||||
{
|
{
|
||||||
return (lhs) > (rhs);
|
return (lhs) > (rhs);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class lobj,class robj> class vge {
|
template<class lobj,class robj> class vge {
|
||||||
public:
|
public:
|
||||||
vInteger operator()(const lobj &lhs, const robj &rhs)
|
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
|
||||||
{
|
{
|
||||||
return (lhs) >= (rhs);
|
return (lhs) >= (rhs);
|
||||||
}
|
}
|
||||||
@ -88,42 +88,42 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
// Generic list of functors
|
// Generic list of functors
|
||||||
template<class lobj,class robj> class seq {
|
template<class lobj,class robj> class seq {
|
||||||
public:
|
public:
|
||||||
Integer operator()(const lobj &lhs, const robj &rhs)
|
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
|
||||||
{
|
{
|
||||||
return (lhs) == (rhs);
|
return (lhs) == (rhs);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class lobj,class robj> class sne {
|
template<class lobj,class robj> class sne {
|
||||||
public:
|
public:
|
||||||
Integer operator()(const lobj &lhs, const robj &rhs)
|
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
|
||||||
{
|
{
|
||||||
return (lhs) != (rhs);
|
return (lhs) != (rhs);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class lobj,class robj> class slt {
|
template<class lobj,class robj> class slt {
|
||||||
public:
|
public:
|
||||||
Integer operator()(const lobj &lhs, const robj &rhs)
|
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
|
||||||
{
|
{
|
||||||
return (lhs) < (rhs);
|
return (lhs) < (rhs);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class lobj,class robj> class sle {
|
template<class lobj,class robj> class sle {
|
||||||
public:
|
public:
|
||||||
Integer operator()(const lobj &lhs, const robj &rhs)
|
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
|
||||||
{
|
{
|
||||||
return (lhs) <= (rhs);
|
return (lhs) <= (rhs);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class lobj,class robj> class sgt {
|
template<class lobj,class robj> class sgt {
|
||||||
public:
|
public:
|
||||||
Integer operator()(const lobj &lhs, const robj &rhs)
|
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
|
||||||
{
|
{
|
||||||
return (lhs) > (rhs);
|
return (lhs) > (rhs);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class lobj,class robj> class sge {
|
template<class lobj,class robj> class sge {
|
||||||
public:
|
public:
|
||||||
Integer operator()(const lobj &lhs, const robj &rhs)
|
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
|
||||||
{
|
{
|
||||||
return (lhs) >= (rhs);
|
return (lhs) >= (rhs);
|
||||||
}
|
}
|
||||||
@ -133,7 +133,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
// Integer and real get extra relational functions.
|
// Integer and real get extra relational functions.
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
|
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
|
||||||
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
|
accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
|
||||||
{
|
{
|
||||||
typedef typename vsimd::scalar_type scalar;
|
typedef typename vsimd::scalar_type scalar;
|
||||||
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
|
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
|
||||||
@ -150,7 +150,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
|
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
|
||||||
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
|
accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
|
||||||
{
|
{
|
||||||
typedef typename vsimd::scalar_type scalar;
|
typedef typename vsimd::scalar_type scalar;
|
||||||
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
|
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
|
||||||
@ -165,7 +165,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
|
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
|
||||||
inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
|
accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
|
||||||
{
|
{
|
||||||
typedef typename vsimd::scalar_type scalar;
|
typedef typename vsimd::scalar_type scalar;
|
||||||
ExtractBuffer<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
|
ExtractBuffer<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
|
||||||
@ -181,30 +181,30 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
#define DECLARE_RELATIONAL_EQ(op,functor) \
|
#define DECLARE_RELATIONAL_EQ(op,functor) \
|
||||||
template<class vsimd,IfSimd<vsimd> = 0>\
|
template<class vsimd,IfSimd<vsimd> = 0>\
|
||||||
inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
|
accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
|
||||||
{\
|
{\
|
||||||
typedef typename vsimd::scalar_type scalar;\
|
typedef typename vsimd::scalar_type scalar;\
|
||||||
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
|
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
|
||||||
}\
|
}\
|
||||||
template<class vsimd,IfSimd<vsimd> = 0>\
|
template<class vsimd,IfSimd<vsimd> = 0>\
|
||||||
inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
|
accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
|
||||||
{\
|
{\
|
||||||
typedef typename vsimd::scalar_type scalar;\
|
typedef typename vsimd::scalar_type scalar;\
|
||||||
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
|
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
|
||||||
}\
|
}\
|
||||||
template<class vsimd,IfSimd<vsimd> = 0>\
|
template<class vsimd,IfSimd<vsimd> = 0>\
|
||||||
inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
|
accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
|
||||||
{\
|
{\
|
||||||
typedef typename vsimd::scalar_type scalar;\
|
typedef typename vsimd::scalar_type scalar;\
|
||||||
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
|
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
|
||||||
}\
|
}\
|
||||||
template<class vsimd>\
|
template<class vsimd>\
|
||||||
inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
|
accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
|
||||||
{ \
|
{ \
|
||||||
return lhs._internal op rhs; \
|
return lhs._internal op rhs; \
|
||||||
} \
|
} \
|
||||||
template<class vsimd>\
|
template<class vsimd>\
|
||||||
inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
|
accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
|
||||||
{ \
|
{ \
|
||||||
return lhs op rhs._internal; \
|
return lhs op rhs._internal; \
|
||||||
} \
|
} \
|
||||||
@ -212,7 +212,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
/* DECLARE_RELATIONAL(op,functor): expands DECLARE_RELATIONAL_EQ(op,functor) and
   additionally declares `operator op` for two iScalar<vsimd> operands, which
   compares their _internal members. */
#define DECLARE_RELATIONAL(op,functor)                                  \
  DECLARE_RELATIONAL_EQ(op,functor)                                     \
  template<class vsimd>                                                 \
  accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,    \
                                          const iScalar<vsimd> &rhs)    \
  {                                                                     \
    return lhs._internal op rhs._internal;                              \
  }
|
||||||
|
@ -1,138 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/lattice/Lattice_overload.h
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#ifndef GRID_LATTICE_OVERLOAD_H
|
|
||||||
#define GRID_LATTICE_OVERLOAD_H
|
|
||||||
|
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// unary negation
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
template<class vobj>
|
|
||||||
inline Lattice<vobj> operator -(const Lattice<vobj> &r)
|
|
||||||
{
|
|
||||||
Lattice<vobj> ret(r._grid);
|
|
||||||
parallel_for(int ss=0;ss<r._grid->oSites();ss++){
|
|
||||||
vstream(ret._odata[ss], -r._odata[ss]);
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Lattice BinOp Lattice,
|
|
||||||
//NB mult performs conformable check. Do not reapply here for performance.
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
template<class left,class right>
|
|
||||||
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
|
|
||||||
{
|
|
||||||
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
|
|
||||||
mult(ret,lhs,rhs);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
template<class left,class right>
|
|
||||||
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
|
|
||||||
{
|
|
||||||
Lattice<decltype(lhs._odata[0]+rhs._odata[0])> ret(rhs._grid);
|
|
||||||
add(ret,lhs,rhs);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
template<class left,class right>
|
|
||||||
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
|
|
||||||
{
|
|
||||||
Lattice<decltype(lhs._odata[0]-rhs._odata[0])> ret(rhs._grid);
|
|
||||||
sub(ret,lhs,rhs);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Scalar BinOp Lattice ;generate return type
|
|
||||||
template<class left,class right>
|
|
||||||
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
|
|
||||||
{
|
|
||||||
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
|
|
||||||
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
|
||||||
decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];
|
|
||||||
vstream(ret._odata[ss],tmp);
|
|
||||||
// ret._odata[ss]=lhs*rhs._odata[ss];
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
template<class left,class right>
|
|
||||||
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
|
|
||||||
{
|
|
||||||
Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
|
|
||||||
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
|
||||||
decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss];
|
|
||||||
vstream(ret._odata[ss],tmp);
|
|
||||||
// ret._odata[ss]=lhs+rhs._odata[ss];
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
template<class left,class right>
|
|
||||||
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
|
|
||||||
{
|
|
||||||
Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
|
|
||||||
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
|
||||||
decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
|
|
||||||
vstream(ret._odata[ss],tmp);
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
template<class left,class right>
|
|
||||||
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
|
|
||||||
{
|
|
||||||
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
|
|
||||||
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
|
|
||||||
decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
|
|
||||||
vstream(ret._odata[ss],tmp);
|
|
||||||
// ret._odata[ss]=lhs._odata[ss]*rhs;
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
template<class left,class right>
|
|
||||||
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
|
|
||||||
{
|
|
||||||
Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
|
|
||||||
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
|
||||||
decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
|
|
||||||
vstream(ret._odata[ss],tmp);
|
|
||||||
// ret._odata[ss]=lhs._odata[ss]+rhs;
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
template<class left,class right>
|
|
||||||
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
|
|
||||||
{
|
|
||||||
Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
|
|
||||||
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
|
|
||||||
decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
|
|
||||||
vstream(ret._odata[ss],tmp);
|
|
||||||
// ret._odata[ss]=lhs._odata[ss]-rhs;
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
@ -22,6 +22,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <Grid/Grid_Eigen_Dense.h>
|
#include <Grid/Grid_Eigen_Dense.h>
|
||||||
|
#ifdef GRID_NVCC
|
||||||
|
#include <thrust/host_vector.h>
|
||||||
|
#include <thrust/device_vector.h>
|
||||||
|
#include <thrust/generate.h>
|
||||||
|
#include <thrust/reduce.h>
|
||||||
|
#include <thrust/functional.h>
|
||||||
|
#include <thrust/reduce.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
@ -33,6 +41,25 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
|
|||||||
return real(nrm);
|
return real(nrm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef GRID_NVCC
|
||||||
|
//#warning "ThrustReduce compiled"
|
||||||
|
//#include <thrust/execution_policy.h>
|
||||||
|
template<class vobj>
|
||||||
|
vobj ThrustNorm(const Lattice<vobj> &lat)
|
||||||
|
{
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
auto lat_v=lat.View();
|
||||||
|
Integer s0=0;
|
||||||
|
Integer sN=lat_v.end();
|
||||||
|
scalar_type sum = 0;
|
||||||
|
scalar_type * begin = (scalar_type *)&lat_v[s0];
|
||||||
|
scalar_type * end = (scalar_type *)&lat_v[sN];
|
||||||
|
thrust::reduce(begin,end,sum);
|
||||||
|
std::cout <<" thrust::reduce sum "<< sum << std::endl;
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Double inner product
|
// Double inner product
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
|
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
|
||||||
@ -47,7 +74,26 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
|
|
||||||
auto left_v = left.View();
|
auto left_v = left.View();
|
||||||
auto right_v=right.View();
|
auto right_v=right.View();
|
||||||
|
#ifdef GRID_NVCC
|
||||||
|
//#if 0
|
||||||
|
|
||||||
|
typedef decltype(TensorRemove(innerProduct(left_v[0],right_v[0]))) inner_t;
|
||||||
|
|
||||||
|
Lattice<inner_t> inner_tmp(grid);
|
||||||
|
|
||||||
|
/////////////////////////
|
||||||
|
// localInnerProduct
|
||||||
|
/////////////////////////
|
||||||
|
auto inner_tmp_v = inner_tmp.View();
|
||||||
|
accelerator_loop(ss,left_v,{
|
||||||
|
inner_tmp_v[ss] = TensorRemove(innerProduct(left_v[ss],right_v[ss]));
|
||||||
|
});
|
||||||
|
/////////////////////////
|
||||||
|
// and site sum the scalars
|
||||||
|
/////////////////////////
|
||||||
|
inner_t vnrm = ThrustNorm(inner_tmp);
|
||||||
|
auto vvnrm = vnrm;
|
||||||
|
#else
|
||||||
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
|
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
|
||||||
int mywork, myoff;
|
int mywork, myoff;
|
||||||
GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff);
|
GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff);
|
||||||
@ -63,6 +109,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
for(int i=0;i<grid->SumArraySize();i++){
|
for(int i=0;i<grid->SumArraySize();i++){
|
||||||
vvnrm = vvnrm+sumarray[i];
|
vvnrm = vvnrm+sumarray[i];
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
nrm = Reduce(vvnrm);// sum across simd
|
nrm = Reduce(vvnrm);// sum across simd
|
||||||
right.Grid()->GlobalSum(nrm);
|
right.Grid()->GlobalSum(nrm);
|
||||||
return nrm;
|
return nrm;
|
||||||
@ -102,7 +149,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),
|
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),
|
||||||
{
|
{
|
||||||
int nwork, mywork, myoff;
|
int nwork, mywork, myoff;
|
||||||
GridThread::GetWork(x.Grid()->oSites(),thr,mywork,myoff);
|
nwork = x.Grid()->oSites();
|
||||||
|
GridThread::GetWork(nwork,thr,mywork,myoff);
|
||||||
|
|
||||||
// private to thread; sub summation
|
// private to thread; sub summation
|
||||||
decltype(innerProductD(z_v[0],z_v[0])) vnrm=Zero();
|
decltype(innerProductD(z_v[0],z_v[0])) vnrm=Zero();
|
||||||
@ -162,7 +210,8 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
|||||||
auto arg_v=arg.View();
|
auto arg_v=arg.View();
|
||||||
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
|
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
|
||||||
int nwork, mywork, myoff;
|
int nwork, mywork, myoff;
|
||||||
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
|
nwork = grid->oSites();
|
||||||
|
GridThread::GetWork(nwork,thr,mywork,myoff);
|
||||||
|
|
||||||
vobj vvsum=Zero();
|
vobj vvsum=Zero();
|
||||||
for(int ss=myoff;ss<mywork+myoff; ss++){
|
for(int ss=myoff;ss<mywork+myoff; ss++){
|
||||||
@ -576,9 +625,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
|
|||||||
// Lattice<vobj> Rslice(SliceGrid);
|
// Lattice<vobj> Rslice(SliceGrid);
|
||||||
|
|
||||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||||
int nh = FullGrid->_ndimension;
|
// int nh = FullGrid->_ndimension;
|
||||||
// int nl = SliceGrid->_ndimension;
|
// int nl = SliceGrid->_ndimension;
|
||||||
int nl = nh-1;
|
// int nl = nh-1;
|
||||||
|
|
||||||
//FIXME package in a convenient iterator
|
//FIXME package in a convenient iterator
|
||||||
//Should loop over a plane orthogonal to direction "Orthog"
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
@ -629,9 +678,9 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
|
|||||||
// Lattice<vobj> Rslice(SliceGrid);
|
// Lattice<vobj> Rslice(SliceGrid);
|
||||||
|
|
||||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||||
int nh = FullGrid->_ndimension;
|
// int nh = FullGrid->_ndimension;
|
||||||
// int nl = SliceGrid->_ndimension;
|
// int nl = SliceGrid->_ndimension;
|
||||||
int nl=1;
|
// int nl=1;
|
||||||
|
|
||||||
//FIXME package in a convenient iterator
|
//FIXME package in a convenient iterator
|
||||||
//Should loop over a plane orthogonal to direction "Orthog"
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
@ -685,9 +734,9 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
|
|||||||
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||||
|
|
||||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||||
int nh = FullGrid->_ndimension;
|
// int nh = FullGrid->_ndimension;
|
||||||
// int nl = SliceGrid->_ndimension;
|
// int nl = SliceGrid->_ndimension;
|
||||||
int nl = nh-1;
|
// int nl = nh-1;
|
||||||
|
|
||||||
//FIXME package in a convenient iterator
|
//FIXME package in a convenient iterator
|
||||||
//Should loop over a plane orthogonal to direction "Orthog"
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
|
@ -640,7 +640,7 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
|
|||||||
in_grid->iCoorFromIindex(in_icoor[lane], lane);
|
in_grid->iCoorFromIindex(in_icoor[lane], lane);
|
||||||
}
|
}
|
||||||
|
|
||||||
parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
|
thread_loop( (int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++),{ //loop over outer index
|
||||||
//Assemble vector of pointers to output elements
|
//Assemble vector of pointers to output elements
|
||||||
std::vector<sobj*> out_ptrs(in_nsimd);
|
std::vector<sobj*> out_ptrs(in_nsimd);
|
||||||
|
|
||||||
@ -661,7 +661,7 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
|
|||||||
//Unpack into those ptrs
|
//Unpack into those ptrs
|
||||||
const vobj & in_vobj = in._odata[in_oidx];
|
const vobj & in_vobj = in._odata[in_oidx];
|
||||||
extract1(in_vobj, out_ptrs, 0);
|
extract1(in_vobj, out_ptrs, 0);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
|
//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
|
||||||
@ -733,7 +733,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
|||||||
grid->iCoorFromIindex(icoor[lane],lane);
|
grid->iCoorFromIindex(icoor[lane],lane);
|
||||||
}
|
}
|
||||||
|
|
||||||
parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
|
thread_loop( (uint64_t oidx = 0; oidx < grid->oSites(); oidx++),{ //loop over outer index
|
||||||
//Assemble vector of pointers to output elements
|
//Assemble vector of pointers to output elements
|
||||||
std::vector<sobj*> ptrs(nsimd);
|
std::vector<sobj*> ptrs(nsimd);
|
||||||
|
|
||||||
@ -757,7 +757,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
|||||||
vobj vecobj;
|
vobj vecobj;
|
||||||
merge1(vecobj, ptrs, 0);
|
merge1(vecobj, ptrs, 0);
|
||||||
out._odata[oidx] = vecobj;
|
out._odata[oidx] = vecobj;
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
//Convert a Lattice from one precision to another
|
//Convert a Lattice from one precision to another
|
||||||
|
Loading…
Reference in New Issue
Block a user