Mirror of https://github.com/paboyle/Grid.git

Reproducible reduction and axpy_norm offload from Gianluca.
Hopefully get CG running entirely on GPU
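The routines in this patch (norm2, innerProduct, axpy_norm_fast/axpby_norm_fast) are exactly the per-iteration reductions a conjugate-gradient solver calls, which is what the offload targets. A minimal sketch of that usage follows; the linear operator, field names and stopping constants are illustrative assumptions, not part of this commit:

// Sketch: one CG-style iteration built on the offloaded reductions (illustrative only).
LatticeFermion x(grid), r(grid), p(grid), Ap(grid);  // assumed fields, r and p already initialised
RealD cp = norm2(r);                                 // |r|^2 via the deterministic reduction
for (int k = 0; k < MaxIterations; k++) {            // MaxIterations, Tolerance assumed
  Linop.HermOp(p, Ap);                               // assumed Hermitian linear operator
  RealD d     = real(innerProduct(p, Ap));           // offloaded inner product
  RealD alpha = cp / d;
  axpy(x, alpha, p, x);                              // x <- x + alpha p
  RealD cp_new = axpy_norm_fast(r, -alpha, Ap, r);   // r <- r - alpha Ap, |r|^2 from the fused kernel
  RealD beta = cp_new / cp;  cp = cp_new;
  axpby(p, 1.0, beta, r, p);                         // p <- r + beta p
  if (cp < Tolerance) break;
}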
		@@ -23,154 +23,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

#include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_NVCC
#include <thrust/inner_product.h>
#include <Grid/lattice/Lattice_reduction_gpu.h>
#endif

NAMESPACE_BEGIN(Grid);

  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Deterministic Reduction operations
  ////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
  ComplexD nrm = innerProduct(arg,arg);
  return real(nrm); 
}

#ifdef GRID_NVCC
template<class T, class R>
struct innerProductFunctor : public thrust::binary_function<T,T,R>
{
  accelerator R operator()(T x, T y) { return innerProduct(x,y); }
};
#endif

// Double inner product
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  scalar_type  nrm;
  
  GridBase *grid = left.Grid();
  
  Vector<vector_type> sumarray(grid->SumArraySize());

  auto left_v = left.View();
  auto right_v=right.View();

#if 0
  typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
  thrust::plus<inner_t> binary_sum;
  innerProductFunctor<vobj,inner_t> binary_inner_p;
  Integer sN = left_v.end();
  inner_t zero = Zero();
  // is there a way of using the efficient thrust reduction while maintaining memory coalescing?
  inner_t vnrm = thrust::inner_product(thrust::device, &left_v[0], &left_v[sN], &right_v[0], zero, binary_sum, binary_inner_p);
  nrm = Reduce(TensorRemove(vnrm));// sum across simd
#else
  thread_for( thr,grid->SumArraySize(),{
    int mywork, myoff;
    GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff);
    
    decltype(innerProductD(left_v[0],right_v[0])) vnrm=Zero(); // private to thread; sub summation
    for(int ss=myoff;ss<mywork+myoff; ss++){
      vnrm = vnrm + innerProductD(left_v[ss],right_v[ss]);
    }
    sumarray[thr]=TensorRemove(vnrm) ;
  });
  
  vector_type vvnrm; vvnrm=Zero();  // sum across threads
  for(int i=0;i<grid->SumArraySize();i++){
    vvnrm = vvnrm+sumarray[i];
  }
  nrm = Reduce(vvnrm);// sum across simd
#endif
  right.Grid()->GlobalSum(nrm);
  return nrm;
}

/////////////////////////
// Fast axpby_norm
// z = a x + b y
// return norm z
/////////////////////////
template<class sobj,class vobj> strong_inline RealD 
axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) 
{
  sobj one(1.0);
  return axpby_norm_fast(z,a,one,x,y);
}

template<class sobj,class vobj> strong_inline RealD 
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) 
{
  const int pad = 8;
  z.Checkerboard() = x.Checkerboard();
  conformable(z,x);
  conformable(x,y);

  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  RealD  nrm;
  
  GridBase *grid = x.Grid();
  
  Vector<RealD> sumarray(grid->SumArraySize()*pad);
  
  auto x_v=x.View();
  auto y_v=y.View();
  auto z_v=z.View();
  thread_for(thr,grid->SumArraySize(),
  {
    int nwork, mywork, myoff;
    nwork = x.Grid()->oSites();
    GridThread::GetWork(nwork,thr,mywork,myoff);
    
    // private to thread; sub summation
    decltype(innerProductD(z_v[0],z_v[0])) vnrm=Zero(); 
    for(int ss=myoff;ss<mywork+myoff; ss++){
      vobj tmp = a*x_v[ss]+b*y_v[ss];
      vnrm = vnrm + innerProductD(tmp,tmp);
      vstream(z_v[ss],tmp);
    }
    vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
  });
  
  nrm = 0.0; // sum across threads; linear in thread count but fast
  for(int i=0;i<grid->SumArraySize();i++){
    nrm = nrm+sumarray[i*pad];
  } 
  z.Grid()->GlobalSum(nrm);
  return nrm; 
}

template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
  ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
{
  return sum(closure(expr));
}

template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
      ->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object
{
  return sum(closure(expr));
}

template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
  ->typename decltype(expr.op.func(eval(0,expr.arg1),
				      eval(0,expr.arg2),
				      eval(0,expr.arg3)
				      ))::scalar_object
{
  return sum(closure(expr));
}

#ifndef GRID_NVCC
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
@@ -211,8 +69,124 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
  
  return ssum;
}
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
  ComplexD nrm = innerProduct(arg,arg);
  return real(nrm); 
}

// Double inner product
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  scalar_type  nrm;
  
  GridBase *grid = left.Grid();
  
  // Might make all code paths go this way.
  auto left_v = left.View();
  auto right_v=right.View();

  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();
  
  typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
  Lattice<inner_t> inner_tmp(grid);
  auto inner_tmp_v = inner_tmp.View();
  
  accelerator_for( ss, sites, nsimd,{
      auto x_l = left_v(ss);
      auto y_l = right_v(ss);
      coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
  })

  nrm = TensorRemove(sum(inner_tmp));

  right.Grid()->GlobalSum(nrm);
  return nrm;
}

/////////////////////////
// Fast axpby_norm
// z = a x + b y
// return norm z
/////////////////////////
template<class sobj,class vobj> strong_inline RealD 
axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) 
{
  sobj one(1.0);
  return axpby_norm_fast(z,a,one,x,y);
}

template<class sobj,class vobj> strong_inline RealD 
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) 
{
  const int pad = 8;
  z.Checkerboard() = x.Checkerboard();
  conformable(z,x);
  conformable(x,y);

  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  RealD  nrm;
  
  GridBase *grid = x.Grid();

  auto x_v=x.View();
  auto y_v=y.View();
  auto z_v=z.View();

  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();
  
  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
  Lattice<inner_t> inner_tmp(grid);

  accelerator_for( ss, sites, nsimd,{
      auto tmp = a*x_v(ss)+b*y_v(ss);
      coalescedWrite(inner_tmp[ss],innerProduct(tmp,tmp));
      coalescedWrite(z_v[ss],tmp);
  })
  
  nrm = TensorRemove(sum(inner_tmp));

  z.Grid()->GlobalSum(nrm);
  return nrm; 
}

 
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
  ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
{
  return sum(closure(expr));
}

template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
      ->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object
{
  return sum(closure(expr));
}

template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
  ->typename decltype(expr.op.func(eval(0,expr.arg1),
				      eval(0,expr.arg2),
				      eval(0,expr.arg3)
				      ))::scalar_object
{
  return sum(closure(expr));
}

//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
//////////////////////////////////////////////////////////////////////////////////////////////////////////////

Grid/lattice/Lattice_reduction_gpu.h (new file, 221 lines)
@@ -0,0 +1,221 @@
NAMESPACE_BEGIN(Grid);

#define WARP_SIZE 32
extern cudaDeviceProp *gpu_props;
__device__ unsigned int retirementCount = 0;

template <class Iterator>
unsigned int nextPow2(Iterator x) {
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return ++x;
}

template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
  
  int device;
  cudaGetDevice(&device);
  
  Iterator warpSize            = gpu_props[device].warpSize;
  Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
  Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
  Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
  
  std::cout << GridLogDebug << "GPU has:" << std::endl;
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
  
  if (warpSize != WARP_SIZE) {
    std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
    exit(EXIT_FAILURE);
  }
  
  // let the number of threads in a block be a multiple of 2, starting from warpSize
  threads = warpSize;
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
  
}

template <class sobj, class Iterator>
__device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid) {
  
  Iterator blockSize = blockDim.x;
  
  // cannot use overloaded operators for sobj as they are not volatile-qualified
  memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
  __syncwarp();
  
  const Iterator VEC = WARP_SIZE;
  const Iterator vid = tid & (VEC-1);
  
  sobj beta, temp;
  memcpy((void *)&beta, (void *)&mySum, sizeof(sobj));
  
  for (int i = VEC/2; i > 0; i>>=1) {
    if (vid < i) {
      memcpy((void *)&temp, (void *)&sdata[tid+i], sizeof(sobj));
      beta += temp;
      memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
    }
    __syncwarp();
  }
  __syncthreads();
  
  if (threadIdx.x == 0) {
    beta  = Zero();
    for (Iterator i = 0; i < blockSize; i += VEC) {
      memcpy((void *)&temp, (void *)&sdata[i], sizeof(sobj));
      beta  += temp;
    }
    memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
  }
  __syncthreads();
}

template <class vobj, class Iterator>
__device__ void reduceBlocks(const LatticeView<vobj> g_idata, typename vobj::scalar_object *g_odata, Iterator n) {
  
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_object sobj;
  constexpr Iterator nsimd = sizeof(vector_type)/sizeof(scalar_type);
  
  Iterator blockSize = blockDim.x;
  
  // force shared memory alignment
  extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
  // it's not possible to have two extern __shared__ arrays with same name
  // but different types in different scopes -- need to cast each time
  sobj *sdata = (sobj *)shmem_pointer;
  
  // first level of reduction,
  // each thread writes result in mySum
  Iterator tid = threadIdx.x;
  Iterator i = blockIdx.x*(blockSize*2) + threadIdx.x;
  Iterator gridSize = blockSize*2*gridDim.x;
  sobj mySum = Zero();
  
  while (i < n) {
    Iterator lane = i % nsimd;
    Iterator ss   = i / nsimd;
    auto tmp = extractLane(lane,g_idata[ss]);
    mySum += tmp;
    
    if (i + blockSize < n) {
      lane = (i+blockSize) % nsimd;
      ss   = (i+blockSize) / nsimd;
      tmp = extractLane(lane,g_idata[ss]);
      mySum += tmp;
    }
    i += gridSize;
  }
  
  // copy mySum to shared memory and perform
  // reduction for all threads in this block
  reduceBlock(sdata, mySum, tid);
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

template <class vobj, class Iterator>
__global__ void reduceKernel(const LatticeView<vobj> lat, typename vobj::scalar_object *buffer, Iterator n) {
  typedef typename vobj::scalar_object sobj;
  
  Iterator blockSize = blockDim.x;
  
  // perform reduction for this block and
  // write result to global memory buffer
  reduceBlocks(lat, buffer, n);
  
  if (gridDim.x > 1) {
    
    const Iterator tid = threadIdx.x;
    __shared__ bool amLast;
    // force shared memory alignment
    extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
    // it's not possible to have two extern __shared__ arrays with same name
    // but different types in different scopes -- need to cast each time
    sobj *smem = (sobj *)shmem_pointer;
    
    // wait until all outstanding memory instructions in this thread are finished
    __threadfence();
    
    if (tid==0) {
      unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
      // true if this block is the last block to be done
      amLast = (ticket == gridDim.x-1);
    }
    
    // each thread must read the correct value of amLast
    __syncthreads();
    
    if (amLast) {
      // reduce buffer[0], ..., buffer[gridDim.x-1]
      Iterator i = tid;
      sobj mySum = Zero();
      
      while (i < gridDim.x) {
        mySum += buffer[i];
        i += blockSize;
      }
      
      reduceBlock(smem, mySum, tid);
      
      if (tid==0) {
        buffer[0] = smem[0];
        // reset count variable
        retirementCount = 0;
      }
    }
  }
}

template <class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &lat) 
{
  
  LatticeView<vobj> lat_v = lat.View();
  
  typedef typename vobj::scalar_object sobj;
  typedef decltype(lat_v.begin()) Iterator;
  
  Iterator size = lat.Grid()->lSites();
  
  Iterator numThreads, numBlocks;
  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  Iterator smemSize = numThreads * sizeof(sobj);
  /*  
  std::cout << GridLogDebug << "Starting reduction with:" << std::endl;
  std::cout << GridLogDebug << "\tsize       = " << size << std::endl;
  std::cout << GridLogDebug << "\tnumThreads = " << numThreads << std::endl;
  std::cout << GridLogDebug << "\tnumBlocks  = " << numBlocks << std::endl;
  std::cout << GridLogDebug << "\tsmemSize   = " << smemSize << std::endl;
  */

  Vector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
  
  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat_v, buffer_v, size);
  cudaDeviceSynchronize();
  
  cudaError err = cudaGetLastError();
  if ( cudaSuccess != err ) {
    printf("Cuda error %s\n",cudaGetErrorString( err ));
    exit(0);
  }
  
  auto result = buffer_v[0];
  lat.Grid()->GlobalSum(result);
  return result;
}

NAMESPACE_END(Grid);
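A worked example of the thread/block heuristic in getNumBlocksAndThreads() above; the device figures are illustrative assumptions, not values recorded in this commit:

// Assume warpSize = 32, sharedMemPerBlock = 48 KB (49152 B), maxThreadsPerBlock = 1024,
// multiProcessorCount = 80, and sizeof(sobj) = 16 B (a ComplexD accumulator).
// threads starts at warpSize and doubles while 2*threads*sizeofsobj < 49152 and 2*threads <= 1024,
// so it stops at threads = 1024; blocks = nextPow2(80) = 128.
// sum() then launches reduceKernel with 128 blocks of 1024 threads and
// smemSize = 1024 * 16 B = 16 KB of dynamic shared memory per block.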
@@ -108,12 +108,16 @@ void fillScalar(scalar &s,distribution &dist,generator & gen)
template<class distribution,class generator> 
void fillScalar(ComplexF &s,distribution &dist, generator &gen)
{
  s=ComplexF(dist(gen),dist(gen));
  //  s=ComplexF(dist(gen),dist(gen));
  s.real(dist(gen));
  s.imag(dist(gen));
}
template<class distribution,class generator> 
void fillScalar(ComplexD &s,distribution &dist,generator &gen)
{
  s=ComplexD(dist(gen),dist(gen));
  //  s=ComplexD(dist(gen),dist(gen));
  s.real(dist(gen));
  s.imag(dist(gen));
}
  
class GridRNGbase {

@@ -281,14 +281,17 @@ void GridBanner(void)
    printed=1;
  }
}
#ifdef GRID_NVCC
cudaDeviceProp *gpu_props;
#endif
void GridGpuInit(void)
{
#ifdef GRID_NVCC
  int nDevices = 1;
  cudaGetDeviceCount(&nDevices);
  gpu_props = new cudaDeviceProp[nDevices];

  char * localRankStr = NULL;

  int rank = 0, device = 0, world_rank=0; 
#define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
@@ -324,19 +327,21 @@ void GridGpuInit(void)
#define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d");

      cudaDeviceProp prop; 
      cudaGetDeviceProperties(&prop, i);
      //      cudaGetDeviceProperties(&prop, i);
      cudaGetDeviceProperties(&gpu_props[i], i);
      prop = gpu_props[i];
      printf("GpuInit: ========================\n");
      printf("GpuInit: Device Number    : %d\n", i);
      printf("GpuInit: ========================\n");
      printf("GpuInit: Device identifier: %s\n", prop.name);
      printf("GpuInit:   Peak Memory Bandwidth (GB/s): %f\n",(float)2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
      //      printf("GpuInit:   Peak Memory Bandwidth (GB/s): %f\n",(float)2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
      GPU_PROP(managedMemory);
      GPU_PROP(isMultiGpuBoard);
      GPU_PROP(warpSize);
#if 0
      GPU_PROP(unifiedAddressing);
      GPU_PROP(isMultiGpuBoard);
      GPU_PROP(l2CacheSize);
      GPU_PROP(singleToDoublePrecisionPerfRatio);
      GPU_PROP(warpSize);
#endif
    }
    printf("GpuInit: ================================================\n");