
GPU reductions, first cut: uses Thrust and is non-reproducible; an inclusive scan can fix this if desired.

Local reduction to a LatticeComplex, followed by a further reduction.
Peter Boyle 2019-01-01 13:53:37 +00:00
parent 3eae9a9e3f
commit 715babeac8
4 changed files with 82 additions and 171 deletions
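For reference: a tree reduction on the GPU reassociates floating-point additions, so the low bits of the sum differ from a fixed-order sum; that is the non-reproducibility the commit message refers to. A minimal sketch of the scan-based alternative it alludes to, using thrust::inclusive_scan and taking the last partial sum; the prefix sums are then well defined, though bitwise run-to-run determinism still depends on the scan implementation:

// Sketch only: sum via inclusive scan; the final element of the
// prefix-sum array is the total.
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <iostream>

int main(void) {
  thrust::device_vector<double> data(1024, 0.1);   // toy input
  thrust::device_vector<double> psum(data.size());
  thrust::inclusive_scan(data.begin(), data.end(), psum.begin());
  double sum = psum.back();                        // total = last prefix sum
  std::cout << "scan-based sum = " << sum << std::endl;
  return 0;
}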


@@ -44,42 +44,42 @@ NAMESPACE_BEGIN(Grid);
//
template<class lobj,class robj> class veq {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) == (rhs);
}
};
template<class lobj,class robj> class vne {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) != (rhs);
}
};
template<class lobj,class robj> class vlt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) < (rhs);
}
};
template<class lobj,class robj> class vle {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) <= (rhs);
}
};
template<class lobj,class robj> class vgt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) > (rhs);
}
};
template<class lobj,class robj> class vge {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) >= (rhs);
}
@@ -88,42 +88,42 @@ NAMESPACE_BEGIN(Grid);
// Generic list of functors
template<class lobj,class robj> class seq {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) == (rhs);
}
};
template<class lobj,class robj> class sne {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) != (rhs);
}
};
template<class lobj,class robj> class slt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) < (rhs);
}
};
template<class lobj,class robj> class sle {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) <= (rhs);
}
};
template<class lobj,class robj> class sgt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) > (rhs);
}
};
template<class lobj,class robj> class sge {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) >= (rhs);
}
@@ -133,7 +133,7 @@ NAMESPACE_BEGIN(Grid);
// Integer and real get extra relational functions.
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
{
typedef typename vsimd::scalar_type scalar;
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
@@ -150,7 +150,7 @@ NAMESPACE_BEGIN(Grid);
}
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
{
typedef typename vsimd::scalar_type scalar;
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
@@ -165,7 +165,7 @@ NAMESPACE_BEGIN(Grid);
}
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
{
typedef typename vsimd::scalar_type scalar;
ExtractBuffer<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
@@ -181,30 +181,30 @@ NAMESPACE_BEGIN(Grid);
#define DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
{ \
return lhs._internal op rhs; \
} \
template<class vsimd>\
inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
{ \
return lhs op rhs._internal; \
} \
@@ -212,7 +212,7 @@ NAMESPACE_BEGIN(Grid);
#define DECLARE_RELATIONAL(op,functor) \
DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \
return lhs._internal op rhs._internal; \
}
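Every functor and operator in this file gains an accelerator / accelerator_inline marker so it is callable from device code. A plausible sketch of such markers, assuming an nvcc build guarded by GRID_NVCC (Grid's actual definitions may differ in detail):

// Hypothetical host/device annotation macros in the spirit of the change:
// under nvcc a function must be __host__ __device__ to be callable from a
// kernel; on a plain CPU build the markers collapse to nothing / inline.
#ifdef GRID_NVCC
#define accelerator        __host__ __device__
#define accelerator_inline __host__ __device__ inline
#else
#define accelerator
#define accelerator_inline inline
#endif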


@@ -1,138 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_overload.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_OVERLOAD_H
#define GRID_LATTICE_OVERLOAD_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////////
// unary negation
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline Lattice<vobj> operator -(const Lattice<vobj> &r)
{
Lattice<vobj> ret(r._grid);
parallel_for(int ss=0;ss<r._grid->oSites();ss++){
vstream(ret._odata[ss], -r._odata[ss]);
}
return ret;
}
/////////////////////////////////////////////////////////////////////////////////////
// Lattice BinOp Lattice,
//NB mult performs conformable check. Do not reapply here for performance.
/////////////////////////////////////////////////////////////////////////////////////
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
mult(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]+rhs._odata[0])> ret(rhs._grid);
add(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]-rhs._odata[0])> ret(rhs._grid);
sub(ret,lhs,rhs);
return ret;
}
// Scalar BinOp Lattice ;generate return type
template<class left,class right>
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs*rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
{
Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs+rhs._odata[0]) tmp =lhs+rhs._odata[ss];
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs+rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
{
Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
vstream(ret._odata[ss],tmp);
}
return ret;
}
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]*rhs;
}
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
{
Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]+rhs;
}
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
{
Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]-rhs;
}
return ret;
}
}
#endif
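The overloads above store each site result with vstream rather than plain assignment. A semantics-only sketch of vstream, assuming it is a (possibly non-temporal) store of a computed value into the destination site; the real implementation may use streaming intrinsics to avoid polluting the cache:

// Assumption: vstream(out, in) behaves as a store of in to out; streaming
// (write-combining) stores are an optimization, not a semantic change.
template<class vobj>
inline void vstream(vobj &out, const vobj &in) {
  out = in;
}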


@@ -22,6 +22,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#pragma once
#include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_NVCC
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#endif
NAMESPACE_BEGIN(Grid);
@@ -33,6 +41,25 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
return real(nrm);
}
#ifdef GRID_NVCC
//#warning "ThrustReduce compiled"
//#include <thrust/execution_policy.h>
template<class vobj>
vobj ThrustNorm(const Lattice<vobj> &lat)
{
typedef typename vobj::scalar_type scalar_type;
auto lat_v=lat.View();
Integer s0=0;
Integer sN=lat_v.end();
scalar_type sum = 0;
scalar_type * begin = (scalar_type *)&lat_v[s0];
scalar_type * end = (scalar_type *)&lat_v[sN];
sum = thrust::reduce(begin,end,sum);
std::cout <<" thrust::reduce sum "<< sum << std::endl;
return sum;
}
#endif
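ThrustNorm flattens the lattice view into a contiguous array of scalar_type and hands [begin, end) to thrust::reduce. A standalone illustration of the same pattern, with a device_vector standing in for the lattice view (illustrative names, not Grid code):

// Reduce a flat array of scalars on the device. Note thrust::reduce RETURNS
// the sum; the init argument is not updated in place, which is why the call
// above must assign its result (sum = thrust::reduce(...)).
#include <thrust/device_vector.h>
#include <thrust/reduce.h>

double flat_sum(const thrust::device_vector<double> &v) {
  return thrust::reduce(v.begin(), v.end(), 0.0);
}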
// Double inner product
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
@@ -47,7 +74,26 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
auto left_v = left.View();
auto right_v=right.View();
#ifdef GRID_NVCC
//#if 0
typedef decltype(TensorRemove(innerProduct(left_v[0],right_v[0]))) inner_t;
Lattice<inner_t> inner_tmp(grid);
/////////////////////////
// localInnerProduct
/////////////////////////
auto inner_tmp_v = inner_tmp.View();
accelerator_loop(ss,left_v,{
inner_tmp_v[ss] = TensorRemove(innerProduct(left_v[ss],right_v[ss]));
});
/////////////////////////
// and site sum the scalars
/////////////////////////
inner_t vnrm = ThrustNorm(inner_tmp);
auto vvnrm = vnrm;
#else
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
int mywork, myoff;
GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff);
@@ -63,6 +109,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
for(int i=0;i<grid->SumArraySize();i++){
vvnrm = vvnrm+sumarray[i];
}
#endif
nrm = Reduce(vvnrm);// sum across simd
right.Grid()->GlobalSum(nrm);
return nrm;
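In outline, the GPU branch above performs a staged reduction (sketch of the stages, not verbatim Grid code):

// 1. site-local:  inner_tmp[ss] = TensorRemove(innerProduct(left[ss],right[ss]))
// 2. device sum:  vnrm = ThrustNorm(inner_tmp)  // thrust::reduce over scalars
// 3. SIMD fold:   nrm  = Reduce(vvnrm)          // sum across vector lanes
// 4. global sum:  grid->GlobalSum(nrm)          // all-reduce across MPI ranks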
@@ -102,7 +149,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),
{
int nwork, mywork, myoff;
GridThread::GetWork(x.Grid()->oSites(),thr,mywork,myoff);
nwork = x.Grid()->oSites();
GridThread::GetWork(nwork,thr,mywork,myoff);
// private to thread; sub summation
decltype(innerProductD(z_v[0],z_v[0])) vnrm=Zero();
@@ -162,7 +210,8 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
auto arg_v=arg.View();
thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
nwork = grid->oSites();
GridThread::GetWork(nwork,thr,mywork,myoff);
vobj vvsum=Zero();
for(int ss=myoff;ss<mywork+myoff; ss++){
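Both hunks above route the site count through an explicit nwork variable before calling GridThread::GetWork, which hands each thread a contiguous [myoff, myoff+mywork) chunk of the nwork items. A sketch of such a partitioner, assuming an even split with the remainder given to the low thread ids (Grid's exact policy may differ):

// Hypothetical even-split partitioner in the spirit of GridThread::GetWork:
// thread thr of nthread receives mywork items starting at myoff.
static void GetWorkSketch(int nwork, int thr, int nthread,
                          int &mywork, int &myoff) {
  int chunk = nwork / nthread;
  int rem   = nwork % nthread;           // first rem threads get one extra
  mywork = chunk + (thr < rem ? 1 : 0);
  myoff  = thr * chunk + (thr < rem ? thr : rem);
}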
@@ -576,9 +625,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@@ -629,9 +678,9 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl=1;
// int nl=1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@@ -685,9 +734,9 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"


@@ -640,7 +640,7 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
in_grid->iCoorFromIindex(in_icoor[lane], lane);
}
parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
thread_loop( (int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++),{ //loop over outer index
//Assemble vector of pointers to output elements
std::vector<sobj*> out_ptrs(in_nsimd);
@@ -661,7 +661,7 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
//Unpack into those ptrs
const vobj & in_vobj = in._odata[in_oidx];
extract1(in_vobj, out_ptrs, 0);
}
});
}
//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
@@ -733,7 +733,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
grid->iCoorFromIindex(icoor[lane],lane);
}
parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
thread_loop( (uint64_t oidx = 0; oidx < grid->oSites(); oidx++),{ //loop over outer index
//Assemble vector of pointers to output elements
std::vector<sobj*> ptrs(nsimd);
@@ -757,7 +757,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
vobj vecobj;
merge1(vecobj, ptrs, 0);
out._odata[oidx] = vecobj;
}
});
}
//Convert a Lattice from one precision to another
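The recurring edit in this file replaces parallel_for(...) { ... } with thread_loop( (...), { ... });. A plausible sketch of such a macro, assuming an OpenMP host build; the parenthesised loop header protects its contents from macro-argument splitting, and the body arrives as __VA_ARGS__ (an assumption about the macro's shape, not Grid's verbatim definition):

// Hypothetical thread_loop macro; usage:
//   thread_loop( (int ss=0; ss<N; ss++), { out[ss] = in[ss]; });
#define thread_loop(range, ...) \
  _Pragma("omp parallel for")   \
  for range { __VA_ARGS__ };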