First cut of GPU reductions, implemented with Thrust; the results are non-reproducible. An inclusive scan can fix this if desired.

Perform a local reduction to a LatticeComplex, then reduce that further.
2025-08-21 13:41:53 +01:00 · 2019-01-01 13:53:37 +00:00
parent 3eae9a9e3f
commit 715babeac8
4 changed files with 82 additions and 171 deletions
--- a/Grid/lattice/Lattice_comparison_utils.h
+++ b/Grid/lattice/Lattice_comparison_utils.h
@@ -44,42 +44,42 @@ NAMESPACE_BEGIN(Grid);
  //
  // veq: comparison functor whose call operator returns `(lhs) == (rhs)` as a vInteger.
  // The `+` line below adds the `accelerator` qualifier to the call operator.
  template<class lobj,class robj> class veq {
  public:
-    vInteger operator()(const lobj &lhs, const robj &rhs)
+    accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) == (rhs);
    }
  };
  // vne: comparison functor whose call operator returns `(lhs) != (rhs)` as a vInteger.
  // The `+` line below adds the `accelerator` qualifier to the call operator.
  template<class lobj,class robj> class vne {
  public:
-    vInteger operator()(const lobj &lhs, const robj &rhs)
+    accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) != (rhs);
    }
  };
  // vlt: comparison functor whose call operator returns `(lhs) < (rhs)` as a vInteger.
  // The `+` line below adds the `accelerator` qualifier to the call operator.
  template<class lobj,class robj> class vlt {
  public:
-    vInteger operator()(const lobj &lhs, const robj &rhs)
+    accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) < (rhs);
    }
  };
  // vle: comparison functor whose call operator returns `(lhs) <= (rhs)` as a vInteger.
  // The `+` line below adds the `accelerator` qualifier to the call operator.
  template<class lobj,class robj> class vle {
  public:
-    vInteger operator()(const lobj &lhs, const robj &rhs)
+    accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) <= (rhs);
    }
  };
  // vgt: comparison functor whose call operator returns `(lhs) > (rhs)` as a vInteger.
  // The `+` line below adds the `accelerator` qualifier to the call operator.
  template<class lobj,class robj> class vgt {
  public:
-    vInteger operator()(const lobj &lhs, const robj &rhs)
+    accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) > (rhs);
    }
  };
  template<class lobj,class robj> class vge {
    public:
-    vInteger operator()(const lobj &lhs, const robj &rhs)
+    accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) >= (rhs);
    }
@@ -88,42 +88,42 @@ NAMESPACE_BEGIN(Grid);
  // Generic list of functors
  // seq: comparison functor whose call operator returns `(lhs) == (rhs)` as an Integer.
  // The `+` line below adds the `accelerator` qualifier to the call operator.
  template<class lobj,class robj> class seq {
  public:
-    Integer operator()(const lobj &lhs, const robj &rhs)
+    accelerator Integer operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) == (rhs);
    }
  };
  // sne: comparison functor whose call operator returns `(lhs) != (rhs)` as an Integer.
  // The `+` line below adds the `accelerator` qualifier to the call operator.
  template<class lobj,class robj> class sne {
  public:
-    Integer operator()(const lobj &lhs, const robj &rhs)
+    accelerator Integer operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) != (rhs);
    }
  };
  // slt: comparison functor whose call operator returns `(lhs) < (rhs)` as an Integer.
  // The `+` line below adds the `accelerator` qualifier to the call operator.
  template<class lobj,class robj> class slt {
  public:
-    Integer operator()(const lobj &lhs, const robj &rhs)
+    accelerator Integer operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) < (rhs);
    }
  };
  // sle: comparison functor whose call operator returns `(lhs) <= (rhs)` as an Integer.
  // The `+` line below adds the `accelerator` qualifier to the call operator.
  template<class lobj,class robj> class sle {
  public:
-    Integer operator()(const lobj &lhs, const robj &rhs)
+    accelerator Integer operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) <= (rhs);
    }
  };
  // sgt: comparison functor whose call operator returns `(lhs) > (rhs)` as an Integer.
  // The `+` line below adds the `accelerator` qualifier to the call operator.
  template<class lobj,class robj> class sgt {
  public:
-    Integer operator()(const lobj &lhs, const robj &rhs)
+    accelerator Integer operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) > (rhs);
    }
  };
  template<class lobj,class robj> class sge {
  public:
-    Integer operator()(const lobj &lhs, const robj &rhs)
+    accelerator Integer operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) >= (rhs);
    }
@@ -133,7 +133,7 @@ NAMESPACE_BEGIN(Grid);
  // Integer and real get extra relational functions.
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
-    inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
+    accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
    {
      typedef typename vsimd::scalar_type scalar;
      ExtractBuffer<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
@@ -150,7 +150,7 @@ NAMESPACE_BEGIN(Grid);
    }
  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
-    inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
+    accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
    {
      typedef typename vsimd::scalar_type scalar;
      ExtractBuffer<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
@@ -165,7 +165,7 @@ NAMESPACE_BEGIN(Grid);
    }
  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
-    inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
+    accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
    {
      typedef typename vsimd::scalar_type scalar;
      ExtractBuffer<scalar> vrhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
@@ -181,30 +181,30 @@ NAMESPACE_BEGIN(Grid);
 #define DECLARE_RELATIONAL_EQ(op,functor) \
  template<class vsimd,IfSimd<vsimd> = 0>\
-    inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
+    accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
    {\
      typedef typename vsimd::scalar_type scalar;\
      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
    }\
  template<class vsimd,IfSimd<vsimd> = 0>\
-    inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
+    accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
    {\
      typedef typename vsimd::scalar_type scalar;\
      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
    }\
  template<class vsimd,IfSimd<vsimd> = 0>\
-    inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
+    accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
    {\
      typedef typename vsimd::scalar_type scalar;\
      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
    }\
  template<class vsimd>\
-    inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
+    accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
    {									\
      return lhs._internal op rhs;					\
    }									\
  template<class vsimd>\
-    inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
+    accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
    {									\
      return lhs op rhs._internal;					\
    }									\
@@ -212,7 +212,7 @@ NAMESPACE_BEGIN(Grid);
 // DECLARE_RELATIONAL(op,functor): expands DECLARE_RELATIONAL_EQ(op,functor) and then
 // defines `operator op` for two iScalar<vsimd> operands, which applies `op` to their
 // _internal members and returns the result as a vInteger.
 #define DECLARE_RELATIONAL(op,functor) \
  DECLARE_RELATIONAL_EQ(op,functor)    \
  template<class vsimd>\
-    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
+    accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
    {									\
      return lhs._internal op rhs._internal;				\
    }									
--- a/Grid/lattice/Lattice_overload.h
+++ b/Grid/lattice/Lattice_overload.h
@@ -1,138 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/lattice/Lattice_overload.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LATTICE_OVERLOAD_H
 #define GRID_LATTICE_OVERLOAD_H
 namespace Grid {
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // unary negation
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Unary minus: builds a new lattice on r's grid and, for every site index in
  // [0, oSites()), stores the negation of r's site value into it via vstream.
  template<class vobj>
  inline Lattice<vobj> operator -(const Lattice<vobj> &r)
  {
    Lattice<vobj> negated(r._grid);
    parallel_for(int site=0;site<r._grid->oSites();site++){
      vstream(negated._odata[site], -r._odata[site]);
    }
    return negated;
  }
  /////////////////////////////////////////////////////////////////////////////////////
  // Lattice BinOp Lattice,
  //NB mult performs conformable check. Do not reapply here for performance.
  /////////////////////////////////////////////////////////////////////////////////////
  // Lattice * Lattice: the element type of the result is whatever the site-wise
  // product yields. The result lattice lives on rhs's grid and is filled by mult().
  template<class left,class right>
    inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
  {
    typedef decltype(lhs._odata[0]*rhs._odata[0]) product_t;
    Lattice<product_t> product(rhs._grid);
    mult(product,lhs,rhs);
    return product;
  }
  // Lattice + Lattice: the element type of the result is whatever the site-wise
  // sum yields. The result lattice lives on rhs's grid and is filled by add().
  template<class left,class right>
    inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
  {
    typedef decltype(lhs._odata[0]+rhs._odata[0]) sum_t;
    Lattice<sum_t> total(rhs._grid);
    add(total,lhs,rhs);
    return total;
  }
  // Lattice - Lattice: the element type of the result is whatever the site-wise
  // difference yields. The result lattice lives on rhs's grid and is filled by sub().
  template<class left,class right>
    inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
  {
    typedef decltype(lhs._odata[0]-rhs._odata[0]) diff_t;
    Lattice<diff_t> difference(rhs._grid);
    sub(difference,lhs,rhs);
    return difference;
  }
  // Scalar BinOp Lattice ;generate return type
  // scalar * Lattice: for every site index in [0, oSites()) of rhs's grid, forms
  // lhs * rhs-site and writes it into the result lattice through vstream.
  template<class left,class right>
  inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
  {
    typedef decltype(lhs*rhs._odata[0]) site_t;
    Lattice<site_t> scaled(rhs._grid);
    parallel_for(int site=0;site<rhs._grid->oSites(); site++){
      site_t value=lhs*rhs._odata[site];
      vstream(scaled._odata[site],value);
    }
    return scaled;
  }
  // scalar + Lattice: returns a lattice on rhs's grid whose value at each site
  // index in [0, oSites()) is lhs + rhs-site. The element type of the result is
  // the type of that site-wise sum.
  template<class left,class right>
    inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
    {
      Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
        // Fix: this is the addition operator, so the site value is lhs + rhs
        // (the previous body computed lhs - rhs here).
        decltype(lhs+rhs._odata[0]) tmp =lhs+rhs._odata[ss];
        vstream(ret._odata[ss],tmp);
      }
      return ret;
    }
  // scalar - Lattice: for every site index in [0, oSites()) of rhs's grid, forms
  // lhs - rhs-site and writes it into the result lattice through vstream.
  template<class left,class right>
    inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
  {
    typedef decltype(lhs-rhs._odata[0]) site_t;
    Lattice<site_t> difference(rhs._grid);
    parallel_for(int site=0;site<rhs._grid->oSites(); site++){
      site_t value=lhs-rhs._odata[site];
      vstream(difference._odata[site],value);
    }
    return difference;
  }
    // Lattice * scalar: for every site index in [0, oSites()) of lhs's grid, forms
    // lhs-site * rhs and writes it into the result lattice through vstream.
    template<class left,class right>
      inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
    {
      typedef decltype(lhs._odata[0]*rhs) site_t;
      Lattice<site_t> scaled(lhs._grid);
      parallel_for(int site=0;site<lhs._grid->oSites(); site++){
        site_t value =lhs._odata[site]*rhs;
        vstream(scaled._odata[site],value);
      }
      return scaled;
    }
    // Lattice + scalar: returns a lattice on lhs's grid whose value at each site
    // index in [0, oSites()) is lhs-site + rhs. The element type of the result is
    // the type of that site-wise sum.
    template<class left,class right>
      inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
    {
      Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
      // Fix: the loop bound comes from the lattice operand's grid. The previous
      // bound read rhs._grid, but rhs is the scalar operand in this overload; the
      // result lattice above is already built on lhs._grid.
      parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
        decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
        vstream(ret._odata[ss],tmp);
      }
      return ret;
    }
    // Lattice - scalar: returns a lattice on lhs's grid whose value at each site
    // index in [0, oSites()) is lhs-site - rhs. The element type of the result is
    // the type of that site-wise difference.
    template<class left,class right>
      inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
    {
      Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
      // Fix: the loop bound comes from the lattice operand's grid. The previous
      // bound read rhs._grid, but rhs is the scalar operand in this overload; the
      // result lattice above is already built on lhs._grid.
      parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
        decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
        vstream(ret._odata[ss],tmp);
      }
      return ret;
    }
 }
 #endif
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -22,6 +22,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #pragma once
 #include <Grid/Grid_Eigen_Dense.h>
 #ifdef GRID_NVCC
 #include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include <thrust/generate.h>
 #include <thrust/reduce.h>
 #include <thrust/functional.h>
 #include <thrust/reduce.h>
 #endif
 NAMESPACE_BEGIN(Grid);
@@ -33,6 +41,25 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
  return real(nrm); 
 }
 #ifdef GRID_NVCC
 //#warning "ThrustReduce compiled"
 //#include <thrust/execution_policy.h>
 // Sums the scalar_type elements of a lattice view with thrust::reduce, prints the
 // total to std::cout, and returns it (converted to vobj by the return statement).
 //
 // The view's storage is treated as a flat scalar_type array running from the
 // address of site 0 to the address of site lat_v.end().
 // NOTE(review): assumes the view's sites are contiguous and made up solely of
 // scalar_type elements — confirm against the View implementation.
 // NOTE(review): the scalar total is returned as a vobj; if that conversion fills
 // every SIMD lane, a caller that reduces across lanes again would over-count —
 // confirm at the call site.
 template<class vobj> 
 vobj ThrustNorm(const Lattice<vobj> &lat)
 {
  typedef typename vobj::scalar_type scalar_type;
  auto lat_v=lat.View();
  Integer s0=0;
  Integer sN=lat_v.end();
  scalar_type sum = 0;
  scalar_type * begin = (scalar_type *)&lat_v[s0];
  scalar_type * end   = (scalar_type *)&lat_v[sN];
  // Fix: thrust::reduce takes the initial value by value and RETURNS the result.
  // The previous code discarded the return value, so `sum` was always left at 0.
  sum = thrust::reduce(begin,end,sum);
  // NOTE(review): this prints on every call; looks like first-cut debug output.
  std::cout <<" thrust::reduce sum "<< sum << std::endl;
  return sum;
 }
 #endif
 // Double inner product
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
@@ -47,7 +74,26 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
  auto left_v = left.View();
  auto right_v=right.View();
 #ifdef GRID_NVCC
  //#if 0
  typedef decltype(TensorRemove(innerProduct(left_v[0],right_v[0]))) inner_t;
  Lattice<inner_t> inner_tmp(grid);
  /////////////////////////
  // localInnerProduct
  /////////////////////////
  auto inner_tmp_v = inner_tmp.View();
  accelerator_loop(ss,left_v,{
      inner_tmp_v[ss] = TensorRemove(innerProduct(left_v[ss],right_v[ss]));
  });
  /////////////////////////
  // and site sum the scalars
  /////////////////////////
  inner_t      vnrm = ThrustNorm(inner_tmp);
  auto vvnrm = vnrm;
 #else
  thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
    int mywork, myoff;
    GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff);
@@ -63,6 +109,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
  for(int i=0;i<grid->SumArraySize();i++){
    vvnrm = vvnrm+sumarray[i];
  } 
 #endif
  nrm = Reduce(vvnrm);// sum across simd
  right.Grid()->GlobalSum(nrm);
  return nrm;
@@ -102,7 +149,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),
  {
    int nwork, mywork, myoff;
-    GridThread::GetWork(x.Grid()->oSites(),thr,mywork,myoff);
+    nwork = x.Grid()->oSites();
    GridThread::GetWork(nwork,thr,mywork,myoff);
    // private to thread; sub summation
    decltype(innerProductD(z_v[0],z_v[0])) vnrm=Zero(); 
@@ -162,7 +210,8 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
  auto arg_v=arg.View();
  thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
    int nwork, mywork, myoff;
-    GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
+    nwork = grid->oSites();
    GridThread::GetWork(nwork,thr,mywork,myoff);
    vobj vvsum=Zero();
    for(int ss=myoff;ss<mywork+myoff; ss++){
@@ -576,9 +625,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
  //  Lattice<vobj> Rslice(SliceGrid);
  assert( FullGrid->_simd_layout[Orthog]==1);
-  int nh =  FullGrid->_ndimension;
+  //  int nh =  FullGrid->_ndimension;
  //  int nl = SliceGrid->_ndimension;
-  int nl = nh-1;
+  //  int nl = nh-1;
  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
@@ -629,9 +678,9 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
  //  Lattice<vobj> Rslice(SliceGrid);
  assert( FullGrid->_simd_layout[Orthog]==1);
-  int nh =  FullGrid->_ndimension;
+  //  int nh =  FullGrid->_ndimension;
  //  int nl = SliceGrid->_ndimension;
-  int nl=1;
+  //  int nl=1;
  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
@@ -685,9 +734,9 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  assert( FullGrid->_simd_layout[Orthog]==1);
-  int nh =  FullGrid->_ndimension;
+  //  int nh =  FullGrid->_ndimension;
  //  int nl = SliceGrid->_ndimension;
-  int nl = nh-1;
+  //  int nl = nh-1;
  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -640,7 +640,7 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
    in_grid->iCoorFromIindex(in_icoor[lane], lane);
  }
-  parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
+  thread_loop( (int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++),{ //loop over outer index
    //Assemble vector of pointers to output elements
    std::vector<sobj*> out_ptrs(in_nsimd);
@@ -661,7 +661,7 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
    //Unpack into those ptrs
    const vobj & in_vobj = in._odata[in_oidx];
    extract1(in_vobj, out_ptrs, 0);
-  }
+  });
 }
 //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
@@ -733,7 +733,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
    grid->iCoorFromIindex(icoor[lane],lane);
  }
-  parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
+  thread_loop( (uint64_t oidx = 0; oidx < grid->oSites(); oidx++),{ //loop over outer index
    //Assemble vector of pointers to output elements
    std::vector<sobj*> ptrs(nsimd);
@@ -757,7 +757,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
    vobj vecobj;
    merge1(vecobj, ptrs, 0);
    out._odata[oidx] = vecobj; 
-  }
+  });
 }
 //Convert a Lattice from one precision to another