From 715babeac8ea251bf5764bb2027a87148d6b7649 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 1 Jan 2019 13:53:37 +0000
Subject: [PATCH] GPU reductions first cut; use thrust, non-reproducible.
 Inclusive scan can fix this if desired. Local reduction to LatticeComplex
 and then further reduction.

---
 Grid/lattice/Lattice_comparison_utils.h |  42 ++++----
 Grid/lattice/Lattice_overload.h         | 138 ------------------------
 Grid/lattice/Lattice_reduction.h        |  65 +++++++++--
 Grid/lattice/Lattice_transfer.h         |   8 +-
 4 files changed, 82 insertions(+), 171 deletions(-)
 delete mode 100644 Grid/lattice/Lattice_overload.h

diff --git a/Grid/lattice/Lattice_comparison_utils.h b/Grid/lattice/Lattice_comparison_utils.h
index f7091977..431aa9e1 100644
--- a/Grid/lattice/Lattice_comparison_utils.h
+++ b/Grid/lattice/Lattice_comparison_utils.h
@@ -44,42 +44,42 @@ NAMESPACE_BEGIN(Grid);
 //
 template<class lobj,class robj> class veq {
 public:
-  vInteger operator()(const lobj &lhs, const robj &rhs)
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) == (rhs);
   }
 };
 template<class lobj,class robj> class vne {
 public:
-  vInteger operator()(const lobj &lhs, const robj &rhs)
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) != (rhs);
   }
 };
 template<class lobj,class robj> class vlt {
 public:
-  vInteger operator()(const lobj &lhs, const robj &rhs)
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) < (rhs);
   }
 };
 template<class lobj,class robj> class vle {
 public:
-  vInteger operator()(const lobj &lhs, const robj &rhs)
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) <= (rhs);
   }
 };
 template<class lobj,class robj> class vgt {
 public:
-  vInteger operator()(const lobj &lhs, const robj &rhs)
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) > (rhs);
   }
 };
 template<class lobj,class robj> class vge {
 public:
-  vInteger operator()(const lobj &lhs, const robj &rhs)
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) >= (rhs);
   }
@@ -88,42 +88,42 @@ NAMESPACE_BEGIN(Grid);
 // Generic list of functors
 template<class lobj,class robj> class seq {
 public:
-  Integer operator()(const lobj &lhs, const robj &rhs)
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) == (rhs);
   }
 };
 template<class lobj,class robj> class sne {
 public:
-  Integer operator()(const lobj &lhs, const robj &rhs)
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) != (rhs);
   }
 };
 template<class lobj,class robj> class slt {
 public:
-  Integer operator()(const lobj &lhs, const robj &rhs)
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) < (rhs);
   }
 };
 template<class lobj,class robj> class sle {
 public:
-  Integer operator()(const lobj &lhs, const robj &rhs)
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) <= (rhs);
   }
 };
 template<class lobj,class robj> class sgt {
 public:
-  Integer operator()(const lobj &lhs, const robj &rhs)
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) > (rhs);
   }
 };
 template<class lobj,class robj> class sge {
 public:
-  Integer operator()(const lobj &lhs, const robj &rhs)
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
   {
     return (lhs) >= (rhs);
   }
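The change in the two hunks above is mechanical: every comparison functor's operator() gains the accelerator attribute, so the same functor can be instantiated inside device kernels as well as host code. For orientation, a plausible definition of these markers under a CUDA build (an assumption about Grid's threading macros, not taken from this patch):

    // Hedged sketch, not Grid's verbatim definition: under nvcc the marker
    // makes a function callable from both host and device code.
    #ifdef GRID_NVCC
    #define accelerator        __host__ __device__
    #define accelerator_inline __host__ __device__ inline
    #else
    #define accelerator
    #define accelerator_inline inline
    #endif
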
@@ -133,7 +133,7 @@ NAMESPACE_BEGIN(Grid);
 // Integer and real get extra relational functions.
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class sfunctor, class vsimd, IfSimd<vsimd> = 0>
-inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
+accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
 {
   typedef typename vsimd::scalar_type scalar;
   ExtractBuffer<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
@@ -150,7 +150,7 @@ NAMESPACE_BEGIN(Grid);
 }
 
 template<class sfunctor, class vsimd, IfSimd<vsimd> = 0>
-inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
+accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
 {
   typedef typename vsimd::scalar_type scalar;
   ExtractBuffer<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
@@ -165,7 +165,7 @@ NAMESPACE_BEGIN(Grid);
 }
 
 template<class sfunctor, class vsimd, IfSimd<vsimd> = 0>
-inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
+accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
 {
   typedef typename vsimd::scalar_type scalar;
   ExtractBuffer<scalar> vrhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
@@ -181,30 +181,30 @@ NAMESPACE_BEGIN(Grid);
 
 #define DECLARE_RELATIONAL_EQ(op,functor) \
 template<class vsimd,IfSimd<vsimd> = 0>\
-inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
+accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
 {\
   typedef typename vsimd::scalar_type scalar;\
   return Comparison(functor<vsimd,vsimd>(),lhs,rhs);\
 }\
 template<class vsimd,IfSimd<vsimd> = 0>\
-inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
+accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
 {\
   typedef typename vsimd::scalar_type scalar;\
   return Comparison(functor<vsimd,scalar>(),lhs,rhs);\
 }\
 template<class vsimd,IfSimd<vsimd> = 0>\
-inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
+accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
 {\
   typedef typename vsimd::scalar_type scalar;\
   return Comparison(functor<scalar,vsimd>(),lhs,rhs);\
 }\
 template<class vsimd>\
-inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
+accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
 { \
   return lhs._internal op rhs; \
 } \
 template<class vsimd>\
-inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
+accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
 { \
   return lhs op rhs._internal; \
 } \
@@ -212,7 +212,7 @@ NAMESPACE_BEGIN(Grid);
 #define DECLARE_RELATIONAL(op,functor) \
 DECLARE_RELATIONAL_EQ(op,functor) \
 template<class vsimd>\
-inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
+accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
 { \
   return lhs._internal op rhs._internal; \
 }
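For context, these macros are expanded lower in the same header once per relational operator (e.g. DECLARE_RELATIONAL(<,slt)); with the accelerator_inline markers above, lane-wise SIMD comparisons become usable from device code too. An illustrative usage sketch (assuming the operator instantiations elsewhere in this header; not part of this hunk):

    // Illustrative only: a SIMD comparison yields a per-lane integer
    // predicate (0 or 1 in each lane), usable e.g. with predicated selection.
    vInteger lanes_less(const vRealD &a, const vRealD &b)
    {
      return a < b;   // lane-wise compare via the slt/vlt functor machinery
    }
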
diff --git a/Grid/lattice/Lattice_overload.h b/Grid/lattice/Lattice_overload.h
deleted file mode 100644
index 0906b610..00000000
--- a/Grid/lattice/Lattice_overload.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/lattice/Lattice_overload.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#ifndef GRID_LATTICE_OVERLOAD_H
-#define GRID_LATTICE_OVERLOAD_H
-
-namespace Grid {
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  // unary negation
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class vobj>
-  inline Lattice<vobj> operator -(const Lattice<vobj> &r)
-  {
-    Lattice<vobj> ret(r._grid);
-    parallel_for(int ss=0;ss<r._grid->oSites();ss++){
-      vstream(ret._odata[ss], -r._odata[ss]);
-    }
-    return ret;
-  }
-  /////////////////////////////////////////////////////////////////////////////////////
-  // Lattice BinOp Lattice,
-  //NB mult performs conformable check. Do not reapply here for performance.
-  /////////////////////////////////////////////////////////////////////////////////////
-  template<class left,class right>
-  inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
-  {
-    Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
-    mult(ret,lhs,rhs);
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
-  {
-    Lattice<decltype(lhs._odata[0]+rhs._odata[0])> ret(rhs._grid);
-    add(ret,lhs,rhs);
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
-  {
-    Lattice<decltype(lhs._odata[0]-rhs._odata[0])> ret(rhs._grid);
-    sub(ret,lhs,rhs);
-    return ret;
-  }
-
-  // Scalar BinOp Lattice ;generate return type
-  template<class left,class right>
-  inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
-  {
-    Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
-    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
-      decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];
-      vstream(ret._odata[ss],tmp);
-      //      ret._odata[ss]=lhs*rhs._odata[ss];
-    }
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
-  {
-    Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
-    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
-      decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss];
-      vstream(ret._odata[ss],tmp);
-      //      ret._odata[ss]=lhs+rhs._odata[ss];
-    }
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
-  {
-    Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
-    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
-      decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
-      vstream(ret._odata[ss],tmp);
-    }
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
-  {
-    Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
-    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
-      decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
-      vstream(ret._odata[ss],tmp);
-      //      ret._odata[ss]=lhs._odata[ss]*rhs;
-    }
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
-  {
-    Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
-    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
-      decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
-      vstream(ret._odata[ss],tmp);
-      //      ret._odata[ss]=lhs._odata[ss]+rhs;
-    }
-    return ret;
-  }
-  template<class left,class right>
-  inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
-  {
-    Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
-    parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
-      decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
-      vstream(ret._odata[ss],tmp);
-      //      ret._odata[ss]=lhs._odata[ss]-rhs;
-    }
-    return ret;
-  }
-}
-#endif
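The deleted header removed the last parallel_for/_odata-style operator overloads. A minimal sketch of how the same unary negation can be expressed in the View()/accelerator_loop idiom this patch uses elsewhere (illustrative only; the actual replacements live in Grid's arithmetic and expression-template headers, not in this diff):

    // Hedged sketch: unary minus in the View()/accelerator_loop style used
    // by the innerProduct hunk below; not the literal replacement code.
    template<class vobj>
    inline Lattice<vobj> operator -(const Lattice<vobj> &r)
    {
      Lattice<vobj> ret(r.Grid());
      auto r_v   = r.View();
      auto ret_v = ret.View();
      accelerator_loop(ss, r_v, {
        ret_v[ss] = -r_v[ss];   // site-wise negation, host or device
      });
      return ret;
    }
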
diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 3f0b70d9..f8b63378 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -22,6 +22,14 @@ Author: paboyle
 #pragma once
 
 #include <Grid/Grid_Eigen_Dense.h>
+#ifdef GRID_NVCC
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h>
+#include <thrust/inner_product.h>
+#endif
 
 NAMESPACE_BEGIN(Grid);
 
@@ -33,6 +41,25 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
   return real(nrm);
 }
 
+#ifdef GRID_NVCC
+//#warning "ThrustReduce compiled"
+//#include
+template<class vobj>
+vobj ThrustNorm(const Lattice<vobj> &lat)
+{
+  typedef typename vobj::scalar_type scalar_type;
+  auto lat_v = lat.View();
+  Integer s0 = 0;
+  Integer sN = lat_v.end();
+  scalar_type sum = 0;
+  scalar_type * begin = (scalar_type *)&lat_v[s0];
+  scalar_type * end   = (scalar_type *)&lat_v[sN];
+  sum = thrust::reduce(begin,end,sum); // thrust::reduce returns the result; it must be assigned
+  std::cout <<" thrust::reduce sum "<< sum << std::endl;
+  return sum;
+}
+#endif
+
 // Double inner product
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
@@ -47,7 +74,26 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
 
   auto left_v = left.View();
   auto right_v=right.View();
+#ifdef GRID_NVCC
+  //#if 0
+  typedef decltype(TensorRemove(innerProduct(left_v[0],right_v[0]))) inner_t;
+
+  Lattice<inner_t> inner_tmp(grid);
+
+  /////////////////////////
+  // localInnerProduct
+  /////////////////////////
+  auto inner_tmp_v = inner_tmp.View();
+  accelerator_loop(ss,left_v,{
+      inner_tmp_v[ss] = TensorRemove(innerProduct(left_v[ss],right_v[ss]));
+  });
+  /////////////////////////
+  // and site sum the scalars
+  /////////////////////////
+  inner_t vnrm = ThrustNorm(inner_tmp);
+  auto vvnrm = vnrm;
+#else
   thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
     int mywork, myoff;
     GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff);
@@ -63,6 +109,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
   for(int i=0;i<grid->SumArraySize();i++){
     vvnrm = vvnrm+sumarray[i];
   }
+#endif
   nrm = Reduce(vvnrm);// sum across simd
   right.Grid()->GlobalSum(nrm);
   return nrm;
@@ -102,7 +149,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
   thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),
   {
     int nwork, mywork, myoff;
-    GridThread::GetWork(x.Grid()->oSites(),thr,mywork,myoff);
+    nwork = x.Grid()->oSites();
+    GridThread::GetWork(nwork,thr,mywork,myoff);
 
     // private to thread; sub summation
     decltype(innerProductD(z_v[0],z_v[0])) vnrm=Zero();
@@ -162,7 +210,8 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
   auto arg_v=arg.View();
   thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
     int nwork, mywork, myoff;
-    GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
+    nwork = grid->oSites();
+    GridThread::GetWork(nwork,thr,mywork,myoff);
 
     vobj vvsum=Zero();
     for(int ss=myoff;ss<mywork+myoff; ss++){
@@ -...,9 +...,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
   //  Lattice<vobj> Rslice(SliceGrid);
 
   assert( FullGrid->_simd_layout[Orthog]==1);
-  int nh = FullGrid->_ndimension;
+  //  int nh = FullGrid->_ndimension;
   //  int nl = SliceGrid->_ndimension;
-  int nl = nh-1;
+  //  int nl = nh-1;
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
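The GPU path added above first does a site-local inner product into a Lattice of scalars and then site-sums those scalars with Thrust, matching the commit message. Note that thrust::reduce returns its result rather than writing through its init argument, so the assignment in ThrustNorm is essential. A self-contained sketch of the same reduction idiom with explicit device pointers (the function name and arguments are illustrative, not Grid API; assumes the data lives in device or managed memory):

    // Hedged sketch of the Thrust reduction idiom used by ThrustNorm.
    #include <thrust/device_ptr.h>
    #include <thrust/reduce.h>
    #include <thrust/functional.h>

    double site_sum(const double *data, size_t n)
    {
      thrust::device_ptr<const double> first(data);
      thrust::device_ptr<const double> last (data + n);
      // device_ptr tags the iterators so reduce dispatches to the GPU backend.
      return thrust::reduce(first, last, 0.0, thrust::plus<double>());
    }

As the subject line notes, this reduction is non-reproducible run to run because the summation order is not fixed; an inclusive scan with a deterministic order could fix that if desired.
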
@@ -629,9 +678,9 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
   //  Lattice<vobj> Rslice(SliceGrid);
 
   assert( FullGrid->_simd_layout[Orthog]==1);
-  int nh = FullGrid->_ndimension;
+  //  int nh = FullGrid->_ndimension;
   //  int nl = SliceGrid->_ndimension;
-  int nl=1;
+  //  int nl=1;
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
@@ -685,9 +734,9 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 
   assert( FullGrid->_simd_layout[Orthog]==1);
-  int nh = FullGrid->_ndimension;
+  //  int nh = FullGrid->_ndimension;
   //  int nl = SliceGrid->_ndimension;
-  int nl = nh-1;
+  //  int nl = nh-1;
 
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index 4f46fb60..789fafbf 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -640,7 +640,7 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
     in_grid->iCoorFromIindex(in_icoor[lane], lane);
   }
 
-  parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
+  thread_loop( (int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++),{ //loop over outer index
     //Assemble vector of pointers to output elements
     std::vector<sobj*> out_ptrs(in_nsimd);
 
@@ -661,7 +661,7 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
     //Unpack into those ptrs
     const vobj & in_vobj = in._odata[in_oidx];
     extract1(in_vobj, out_ptrs, 0);
-  }
+  });
 }
 
 //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
@@ -733,7 +733,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
     grid->iCoorFromIindex(icoor[lane],lane);
   }
 
-  parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
+  thread_loop( (uint64_t oidx = 0; oidx < grid->oSites(); oidx++),{ //loop over outer index
     //Assemble vector of pointers to output elements
     std::vector<sobj*> ptrs(nsimd);
 
@@ -757,7 +757,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
     vobj vecobj;
     merge1(vecobj, ptrs, 0);
     out._odata[oidx] = vecobj;
-  }
+  });
 }
 
 //Convert a Lattice from one precision to another
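The Lattice_transfer.h hunks are a mechanical conversion from the parallel_for macro to thread_loop; both remain host-side threaded loops. The conversion pattern, extracted from the hunks above (N, in and out are placeholders):

    // Old form, removed above:
    parallel_for(int ss=0; ss<N; ss++){
      out[ss] = in[ss];
    }
    // New form: the loop header is parenthesised and passed, together with
    // the body, as macro arguments; note the closing ");".
    thread_loop( (int ss=0; ss<N; ss++),{
      out[ss] = in[ss];
    });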