Mirror of https://github.com/paboyle/Grid.git, synced 2025-04-27 14:15:55 +01:00
Reproducible reduction and axpy_norm offload from Gianluca.
Hopefully this gets CG running entirely on the GPU.
Parent: 1282e1067f
Commit: 9dad7a0094
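Background for the "reproducible" in the title: floating-point addition is not associative, so a reduction whose grouping follows the thread decomposition (as in the thread_for loops removed below, which accumulate one partial per SumArraySize() thread) can change in the last bits whenever the thread count changes. A fixed reduction order makes the answer bit-for-bit repeatable. A small standalone illustration, plain C++ rather than Grid code:

// Standalone illustration (not Grid code): the same numbers summed in two
// different groupings give different IEEE-754 results, which is why a sum
// whose grouping depends on the thread decomposition is not reproducible.
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> v(1000000);
  for (size_t i = 0; i < v.size(); i++) v[i] = 1.0 + 1.0e-8 * (i % 97);

  // Grouping 1: straight left-to-right accumulation.
  double s1 = 0.0;
  for (size_t i = 0; i < v.size(); i++) s1 += v[i];

  // Grouping 2: eight "thread" partial sums combined at the end,
  // mimicking a decomposition that changes with the thread count.
  const int T = 8;
  double partial[T] = {0.0};
  size_t chunk = v.size() / T;
  for (int t = 0; t < T; t++)
    for (size_t i = t * chunk; i < (t + 1) * chunk; i++) partial[t] += v[i];
  double s2 = 0.0;
  for (int t = 0; t < T; t++) s2 += partial[t];

  // The two totals typically differ in the last few bits.
  printf("grouping 1: %.17g\ngrouping 2: %.17g\ndifference: %.3g\n", s1, s2, s1 - s2);
  return 0;
}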
@@ -23,154 +23,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/Grid_Eigen_Dense.h>
 #ifdef GRID_NVCC
-#include <thrust/inner_product.h>
+#include <Grid/lattice/Lattice_reduction_gpu.h>
 #endif
 
 NAMESPACE_BEGIN(Grid);
 
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Deterministic Reduction operations
-////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
-  ComplexD nrm = innerProduct(arg,arg);
-  return real(nrm);
-}
-
-#ifdef GRID_NVCC
-template<class T, class R>
-struct innerProductFunctor : public thrust::binary_function<T,T,R>
-{
-  accelerator R operator()(T x, T y) { return innerProduct(x,y); }
-};
-#endif
-
-// Double inner product
-template<class vobj>
-inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
-{
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_typeD vector_type;
-  scalar_type nrm;
-
-  GridBase *grid = left.Grid();
-
-  Vector<vector_type> sumarray(grid->SumArraySize());
-
-  auto left_v = left.View();
-  auto right_v=right.View();
-
-#if 0
-  typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
-  thrust::plus<inner_t> binary_sum;
-  innerProductFunctor<vobj,inner_t> binary_inner_p;
-  Integer sN = left_v.end();
-  inner_t zero = Zero();
-  // is there a way of using the efficient thrust reduction while maintaining memory coalescing?
-  inner_t vnrm = thrust::inner_product(thrust::device, &left_v[0], &left_v[sN], &right_v[0], zero, binary_sum, binary_inner_p);
-  nrm = Reduce(TensorRemove(vnrm)); // sum across simd
-#else
-  thread_for( thr,grid->SumArraySize(),{
-    int mywork, myoff;
-    GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff);
-
-    decltype(innerProductD(left_v[0],right_v[0])) vnrm=Zero(); // private to thread; sub summation
-    for(int ss=myoff;ss<mywork+myoff; ss++){
-      vnrm = vnrm + innerProductD(left_v[ss],right_v[ss]);
-    }
-    sumarray[thr]=TensorRemove(vnrm) ;
-  });
-
-  vector_type vvnrm; vvnrm=Zero(); // sum across threads
-  for(int i=0;i<grid->SumArraySize();i++){
-    vvnrm = vvnrm+sumarray[i];
-  }
-  nrm = Reduce(vvnrm); // sum across simd
-#endif
-  right.Grid()->GlobalSum(nrm);
-  return nrm;
-}
-
-/////////////////////////
-// Fast axpby_norm
-// z = a x + b y
-// return norm z
-/////////////////////////
-template<class sobj,class vobj> strong_inline RealD
-axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
-{
-  sobj one(1.0);
-  return axpby_norm_fast(z,a,one,x,y);
-}
-
-template<class sobj,class vobj> strong_inline RealD
-axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
-{
-  const int pad = 8;
-  z.Checkerboard() = x.Checkerboard();
-  conformable(z,x);
-  conformable(x,y);
-
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_typeD vector_type;
-  RealD nrm;
-
-  GridBase *grid = x.Grid();
-
-  Vector<RealD> sumarray(grid->SumArraySize()*pad);
-
-  auto x_v=x.View();
-  auto y_v=y.View();
-  auto z_v=z.View();
-  thread_for(thr,grid->SumArraySize(),
-  {
-    int nwork, mywork, myoff;
-    nwork = x.Grid()->oSites();
-    GridThread::GetWork(nwork,thr,mywork,myoff);
-
-    // private to thread; sub summation
-    decltype(innerProductD(z_v[0],z_v[0])) vnrm=Zero();
-    for(int ss=myoff;ss<mywork+myoff; ss++){
-      vobj tmp = a*x_v[ss]+b*y_v[ss];
-      vnrm = vnrm + innerProductD(tmp,tmp);
-      vstream(z_v[ss],tmp);
-    }
-    vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
-  });
-
-  nrm = 0.0; // sum across threads; linear in thread count but fast
-  for(int i=0;i<grid->SumArraySize();i++){
-    nrm = nrm+sumarray[i*pad];
-  }
-  z.Grid()->GlobalSum(nrm);
-  return nrm;
-}
-
-template<class Op,class T1>
-inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
-  ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
-{
-  return sum(closure(expr));
-}
-
-template<class Op,class T1,class T2>
-inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
-  ->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object
-{
-  return sum(closure(expr));
-}
-
-template<class Op,class T1,class T2,class T3>
-inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
-  ->typename decltype(expr.op.func(eval(0,expr.arg1),
-                                   eval(0,expr.arg2),
-                                   eval(0,expr.arg3)
-                                   ))::scalar_object
-{
-  return sum(closure(expr));
-}
+#ifndef GRID_NVCC
 
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
@@ -211,8 +69,124 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
   return ssum;
 }
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Deterministic Reduction operations
+////////////////////////////////////////////////////////////////////////////////////////////////////
+template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
+  ComplexD nrm = innerProduct(arg,arg);
+  return real(nrm);
+}
+
+// Double inner product
+template<class vobj>
+inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
+{
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_typeD vector_type;
+  scalar_type nrm;
+
+  GridBase *grid = left.Grid();
+
+  // Might make all code paths go this way.
+  auto left_v = left.View();
+  auto right_v=right.View();
+
+  const uint64_t nsimd = grid->Nsimd();
+  const uint64_t sites = grid->oSites();
+
+  typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
+  Lattice<inner_t> inner_tmp(grid);
+  auto inner_tmp_v = inner_tmp.View();
+
+  accelerator_for( ss, sites, nsimd,{
+    auto x_l = left_v(ss);
+    auto y_l = right_v(ss);
+    coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
+  })
+
+  nrm = TensorRemove(sum(inner_tmp));
+
+  right.Grid()->GlobalSum(nrm);
+  return nrm;
+}
+
+/////////////////////////
+// Fast axpby_norm
+// z = a x + b y
+// return norm z
+/////////////////////////
+template<class sobj,class vobj> strong_inline RealD
+axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
+{
+  sobj one(1.0);
+  return axpby_norm_fast(z,a,one,x,y);
+}
+
+template<class sobj,class vobj> strong_inline RealD
+axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
+{
+  const int pad = 8;
+  z.Checkerboard() = x.Checkerboard();
+  conformable(z,x);
+  conformable(x,y);
+
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_typeD vector_type;
+  RealD nrm;
+
+  GridBase *grid = x.Grid();
+
+  auto x_v=x.View();
+  auto y_v=y.View();
+  auto z_v=z.View();
+
+  const uint64_t nsimd = grid->Nsimd();
+  const uint64_t sites = grid->oSites();
+
+  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
+  Lattice<inner_t> inner_tmp(grid);
+
+  accelerator_for( ss, sites, nsimd,{
+    auto tmp = a*x_v(ss)+b*y_v(ss);
+    coalescedWrite(inner_tmp[ss],innerProduct(tmp,tmp));
+    coalescedWrite(z_v[ss],tmp);
+  })
+
+  nrm = TensorRemove(sum(inner_tmp));
+
+  z.Grid()->GlobalSum(nrm);
+  return nrm;
+}
+
+template<class Op,class T1>
+inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
+  ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
+{
+  return sum(closure(expr));
+}
+
+template<class Op,class T1,class T2>
+inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
+  ->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object
+{
+  return sum(closure(expr));
+}
+
+template<class Op,class T1,class T2,class T3>
+inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
+  ->typename decltype(expr.op.func(eval(0,expr.arg1),
+                                   eval(0,expr.arg2),
+                                   eval(0,expr.arg3)
+                                   ))::scalar_object
+{
+  return sum(closure(expr));
+}
+
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////
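The rewritten innerProduct and axpby_norm_fast above follow one pattern: evaluate the per-site product into a temporary Lattice with accelerator_for and coalescedWrite, then make a single call to sum(), so the whole reduction stays on the device. The axpy_norm/axpby_norm routines additionally fuse the vector update with the norm of the result, which is exactly the combination a conjugate-gradient iteration needs. Below is a minimal standalone sketch of that call pattern, using std::vector and a trivial identity operator as stand-ins; it is not Grid's ConjugateGradient.

// Minimal sketch (plain C++ with std::vector standing in for Lattice fields,
// and the identity as the operator): one CG iteration uses exactly the
// primitives this commit offloads, an inner product, an axpy, and a fused
// axpy+norm, so once those run on the device the loop needs no host data.
#include <cstdio>
#include <vector>

using Field = std::vector<double>;

double innerProduct(const Field &a, const Field &b) { // cf. Grid's innerProduct
  double s = 0.0;
  for (size_t i = 0; i < a.size(); i++) s += a[i] * b[i];
  return s;
}

// z = a*x + y, returning |z|^2 from the same pass (cf. axpy_norm_fast):
// fusing the update with the norm avoids a second sweep through z.
double axpy_norm(Field &z, double a, const Field &x, const Field &y) {
  double nrm = 0.0;
  for (size_t i = 0; i < z.size(); i++) {
    z[i] = a * x[i] + y[i];
    nrm += z[i] * z[i];
  }
  return nrm;
}

// One CG step for A = identity, just to show where each reduction sits.
void cg_step(Field &x, Field &r, Field &p, double &rr) {
  Field Ap = p;                                 // Ap = A p (identity here)
  double alpha = rr / innerProduct(p, Ap);      // reduction 1
  for (size_t i = 0; i < x.size(); i++) x[i] += alpha * p[i];
  double rr_new = axpy_norm(r, -alpha, Ap, r);  // reduction 2, fused with the update
  double beta = rr_new / rr;
  for (size_t i = 0; i < p.size(); i++) p[i] = r[i] + beta * p[i];
  rr = rr_new;
}

int main() {
  Field b(64, 1.0), x(64, 0.0), r = b, p = r;
  double rr = innerProduct(r, r);
  for (int k = 0; k < 5 && rr > 1e-20; k++) cg_step(x, r, p, rr);
  printf("residual^2 after CG on the identity operator: %g\n", rr);
  return 0;
}

With the identity operator the solve converges in one step; the point is only which reductions appear per iteration, and that each of them maps onto a single device-side pass in the Grid versions above.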
new file: Grid/lattice/Lattice_reduction_gpu.h (221 lines)
@@ -0,0 +1,221 @@
+NAMESPACE_BEGIN(Grid);
+
+#define WARP_SIZE 32
+extern cudaDeviceProp *gpu_props;
+__device__ unsigned int retirementCount = 0;
+
+template <class Iterator>
+unsigned int nextPow2(Iterator x) {
+  --x;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  return ++x;
+}
+
+template <class Iterator>
+void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
+
+  int device;
+  cudaGetDevice(&device);
+
+  Iterator warpSize            = gpu_props[device].warpSize;
+  Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
+  Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
+  Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
+
+  std::cout << GridLogDebug << "GPU has:" << std::endl;
+  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
+  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
+  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
+  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << warpSize << std::endl;
+  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
+
+  if (warpSize != WARP_SIZE) {
+    std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // let the number of threads in a block be a multiple of 2, starting from warpSize
+  threads = warpSize;
+  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
+  // keep all the streaming multiprocessors busy
+  blocks = nextPow2(multiProcessorCount);
+}
+
+template <class sobj, class Iterator>
+__device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid) {
+
+  Iterator blockSize = blockDim.x;
+
+  // cannot use overloaded operators for sobj as they are not volatile-qualified
+  memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
+  __syncwarp();
+
+  const Iterator VEC = WARP_SIZE;
+  const Iterator vid = tid & (VEC-1);
+
+  sobj beta, temp;
+  memcpy((void *)&beta, (void *)&mySum, sizeof(sobj));
+
+  for (int i = VEC/2; i > 0; i>>=1) {
+    if (vid < i) {
+      memcpy((void *)&temp, (void *)&sdata[tid+i], sizeof(sobj));
+      beta += temp;
+      memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
+    }
+    __syncwarp();
+  }
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    beta = Zero();
+    for (Iterator i = 0; i < blockSize; i += VEC) {
+      memcpy((void *)&temp, (void *)&sdata[i], sizeof(sobj));
+      beta += temp;
+    }
+    memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
+  }
+  __syncthreads();
+}
+
+template <class vobj, class Iterator>
+__device__ void reduceBlocks(const LatticeView<vobj> g_idata, typename vobj::scalar_object *g_odata, Iterator n) {
+
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_object sobj;
+  constexpr Iterator nsimd = sizeof(vector_type)/sizeof(scalar_type);
+
+  Iterator blockSize = blockDim.x;
+
+  // force shared memory alignment
+  extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
+  // it's not possible to have two extern __shared__ arrays with same name
+  // but different types in different scopes -- need to cast each time
+  sobj *sdata = (sobj *)shmem_pointer;
+
+  // first level of reduction,
+  // each thread writes result in mySum
+  Iterator tid = threadIdx.x;
+  Iterator i = blockIdx.x*(blockSize*2) + threadIdx.x;
+  Iterator gridSize = blockSize*2*gridDim.x;
+  sobj mySum = Zero();
+
+  while (i < n) {
+    Iterator lane = i % nsimd;
+    Iterator ss   = i / nsimd;
+    auto tmp = extractLane(lane,g_idata[ss]);
+    mySum += tmp;
+
+    if (i + blockSize < n) {
+      lane = (i+blockSize) % nsimd;
+      ss   = (i+blockSize) / nsimd;
+      tmp = extractLane(lane,g_idata[ss]);
+      mySum += tmp;
+    }
+    i += gridSize;
+  }
+
+  // copy mySum to shared memory and perform
+  // reduction for all threads in this block
+  reduceBlock(sdata, mySum, tid);
+  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
+}
+
+template <class vobj, class Iterator>
+__global__ void reduceKernel(const LatticeView<vobj> lat, typename vobj::scalar_object *buffer, Iterator n) {
+  typedef typename vobj::scalar_object sobj;
+
+  Iterator blockSize = blockDim.x;
+
+  // perform reduction for this block and
+  // write result to global memory buffer
+  reduceBlocks(lat, buffer, n);
+
+  if (gridDim.x > 1) {
+
+    const Iterator tid = threadIdx.x;
+    __shared__ bool amLast;
+    // force shared memory alignment
+    extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
+    // it's not possible to have two extern __shared__ arrays with same name
+    // but different types in different scopes -- need to cast each time
+    sobj *smem = (sobj *)shmem_pointer;
+
+    // wait until all outstanding memory instructions in this thread are finished
+    __threadfence();
+
+    if (tid==0) {
+      unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
+      // true if this block is the last block to be done
+      amLast = (ticket == gridDim.x-1);
+    }
+
+    // each thread must read the correct value of amLast
+    __syncthreads();
+
+    if (amLast) {
+      // reduce buffer[0], ..., buffer[gridDim.x-1]
+      Iterator i = tid;
+      sobj mySum = Zero();
+
+      while (i < gridDim.x) {
+        mySum += buffer[i];
+        i += blockSize;
+      }
+
+      reduceBlock(smem, mySum, tid);
+
+      if (tid==0) {
+        buffer[0] = smem[0];
+        // reset count variable
+        retirementCount = 0;
+      }
+    }
+  }
+}
+
+template <class vobj>
+inline typename vobj::scalar_object sum(const Lattice<vobj> &lat)
+{
+  LatticeView<vobj> lat_v = lat.View();
+
+  typedef typename vobj::scalar_object sobj;
+  typedef decltype(lat_v.begin()) Iterator;
+
+  Iterator size = lat.Grid()->lSites();
+
+  Iterator numThreads, numBlocks;
+  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  Iterator smemSize = numThreads * sizeof(sobj);
+  /*
+  std::cout << GridLogDebug << "Starting reduction with:" << std::endl;
+  std::cout << GridLogDebug << "\tsize       = " << size << std::endl;
+  std::cout << GridLogDebug << "\tnumThreads = " << numThreads << std::endl;
+  std::cout << GridLogDebug << "\tnumBlocks  = " << numBlocks << std::endl;
+  std::cout << GridLogDebug << "\tsmemSize   = " << smemSize << std::endl;
+  */
+
+  Vector<sobj> buffer(numBlocks);
+  sobj *buffer_v = &buffer[0];
+
+  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat_v, buffer_v, size);
+  cudaDeviceSynchronize();
+
+  cudaError err = cudaGetLastError();
+  if ( cudaSuccess != err ) {
+    printf("Cuda error %s\n",cudaGetErrorString( err ));
+    exit(0);
+  }
+
+  auto result = buffer_v[0];
+  lat.Grid()->GlobalSum(result);
+  return result;
+}
+
+NAMESPACE_END(Grid);
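reduceKernel finishes the whole reduction in one launch: every block writes its partial sum to the buffer, and the last block to retire (detected with atomicInc on retirementCount behind a __threadfence) folds the partials together. The same pattern, stripped down to plain float data with hypothetical names (sumAll, blockReduce) and assuming a power-of-two block size, looks like this:

// Stripped-down version of the single-launch reduction (plain CUDA, float
// data, hypothetical names; assumes blockDim.x is a power of two and that
// threads*sizeof(float) bytes of dynamic shared memory are passed at launch).
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

__device__ unsigned int retirement = 0;   // cf. retirementCount above

// Tree reduction of one value per thread within a block; returns the block sum.
__device__ float blockReduce(volatile float *sdata, float v, int tid) {
  sdata[tid] = v;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid < s) sdata[tid] += sdata[tid + s];
    __syncthreads();
  }
  return sdata[0];
}

__global__ void sumAll(const float *in, float *partialSums, float *out, int n) {
  extern __shared__ float sdata[];
  int tid = threadIdx.x;

  // Grid-stride accumulation into a per-thread register.
  float v = 0.f;
  for (int i = blockIdx.x * blockDim.x + tid; i < n; i += blockDim.x * gridDim.x)
    v += in[i];

  // First level: one partial sum per block.
  float bsum = blockReduce(sdata, v, tid);
  if (tid == 0) partialSums[blockIdx.x] = bsum;
  __threadfence();                          // publish the partial before ticketing

  // The last block to retire finishes the job.
  __shared__ bool amLast;
  if (tid == 0) amLast = (atomicInc(&retirement, gridDim.x) == gridDim.x - 1);
  __syncthreads();
  if (amLast) {
    float w = 0.f;
    for (int i = tid; i < gridDim.x; i += blockDim.x) w += partialSums[i];
    float total = blockReduce(sdata, w, tid);
    if (tid == 0) { *out = total; retirement = 0; }
  }
}

int main() {
  const int N = 1 << 20, threads = 256, blocks = 64;
  std::vector<float> h(N, 1.0f);
  float *d_in, *d_partial, *d_out;
  cudaMalloc(&d_in, N * sizeof(float));
  cudaMalloc(&d_partial, blocks * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h.data(), N * sizeof(float), cudaMemcpyHostToDevice);

  sumAll<<<blocks, threads, threads * sizeof(float)>>>(d_in, d_partial, d_out, N);
  cudaDeviceSynchronize();

  float result = 0.f;
  cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %.1f (expected %d)\n", result, N);
  cudaFree(d_in); cudaFree(d_partial); cudaFree(d_out);
  return 0;
}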
@@ -108,12 +108,16 @@ void fillScalar(scalar &s,distribution &dist,generator & gen)
 template<class distribution,class generator>
 void fillScalar(ComplexF &s,distribution &dist, generator &gen)
 {
-  s=ComplexF(dist(gen),dist(gen));
+  // s=ComplexF(dist(gen),dist(gen));
+  s.real(dist(gen));
+  s.imag(dist(gen));
 }
 template<class distribution,class generator>
 void fillScalar(ComplexD &s,distribution &dist,generator &gen)
 {
-  s=ComplexD(dist(gen),dist(gen));
+  // s=ComplexD(dist(gen),dist(gen));
+  s.real(dist(gen));
+  s.imag(dist(gen));
 }
 
 class GridRNGbase {
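The fillScalar change replaces the two-argument complex constructor with separate real() and imag() assignments. A likely motivation, though the commit does not say so explicitly, is that C++ leaves the evaluation order of function arguments unspecified, so ComplexF(dist(gen),dist(gen)) may consume the two random draws in either order depending on the compiler; the split form pins the first draw to the real part, which matters for reproducible RNG streams. A tiny standalone check:

// Standalone check (not Grid code): the order in which the two dist(gen)
// calls are evaluated inside a constructor call is unspecified in C++, so
// the real/imag parts could swap between compilers; assigning them in two
// separate statements fixes the order.
#include <complex>
#include <cstdio>
#include <random>

int main() {
  std::mt19937 gen(42);
  std::uniform_real_distribution<float> dist(0.0, 1.0);

  // Order of the two calls below is up to the compiler:
  std::complex<float> a(dist(gen), dist(gen));

  gen.seed(42); // restart the stream
  std::complex<float> b;
  b.real(dist(gen)); // first draw is always the real part
  b.imag(dist(gen)); // second draw is always the imaginary part

  printf("constructor: (%f, %f)\n", a.real(), a.imag());
  printf("split form : (%f, %f)\n", b.real(), b.imag());
  return 0;
}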
@@ -281,14 +281,17 @@ void GridBanner(void)
     printed=1;
   }
 }
+#ifdef GRID_NVCC
+cudaDeviceProp *gpu_props;
+#endif
 void GridGpuInit(void)
 {
 #ifdef GRID_NVCC
   int nDevices = 1;
   cudaGetDeviceCount(&nDevices);
+  gpu_props = new cudaDeviceProp[nDevices];
+
   char * localRankStr = NULL;
 
   int rank = 0, device = 0, world_rank=0;
 #define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
 #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
@@ -324,19 +327,21 @@ void GridGpuInit(void)
 #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
 
     cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, i);
+    // cudaGetDeviceProperties(&prop, i);
+    cudaGetDeviceProperties(&gpu_props[i], i);
+    prop = gpu_props[i];
     printf("GpuInit: ========================\n");
     printf("GpuInit: Device Number    : %d\n", i);
     printf("GpuInit: ========================\n");
     printf("GpuInit: Device identifier: %s\n", prop.name);
-    printf("GpuInit: Peak Memory Bandwidth (GB/s): %f\n",(float)2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
+    // printf("GpuInit: Peak Memory Bandwidth (GB/s): %f\n",(float)2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
     GPU_PROP(managedMemory);
+    GPU_PROP(isMultiGpuBoard);
+    GPU_PROP(warpSize);
 #if 0
     GPU_PROP(unifiedAddressing);
-    GPU_PROP(isMultiGpuBoard);
     GPU_PROP(l2CacheSize);
     GPU_PROP(singleToDoublePrecisionPerfRatio);
-    GPU_PROP(warpSize);
 #endif
   }
   printf("GpuInit: ================================================\n");
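GridGpuInit now caches every device's cudaDeviceProp in the global gpu_props array, so later code such as getNumBlocksAndThreads in Lattice_reduction_gpu.h can size a reduction launch without re-querying the driver. A hedged sketch of that split, query once at start-up and consume at launch time, in plain CUDA host code with hypothetical names:

// Sketch (plain CUDA host code, hypothetical names): query device properties
// once at start-up, keep them in a global array, and read them later when
// choosing a launch configuration, the same division of labour as
// GridGpuInit / getNumBlocksAndThreads.
#include <cstdio>
#include <cuda_runtime.h>

static cudaDeviceProp *g_props = nullptr;   // analogous to gpu_props

void initDeviceProps() {
  int nDevices = 0;
  cudaGetDeviceCount(&nDevices);
  g_props = new cudaDeviceProp[nDevices];
  for (int i = 0; i < nDevices; i++) cudaGetDeviceProperties(&g_props[i], i);
}

// Pick threads/blocks for a reduction over n elements of elemBytes each,
// growing the block until shared memory or the thread limit is hit.
void pickLaunch(size_t n, size_t elemBytes, int &threads, int &blocks) {
  int dev = 0;
  cudaGetDevice(&dev);
  const cudaDeviceProp &p = g_props[dev];
  threads = p.warpSize;
  while (2 * threads * elemBytes < p.sharedMemPerBlock &&
         2 * threads <= p.maxThreadsPerBlock)
    threads *= 2;
  blocks = p.multiProcessorCount;   // enough blocks to occupy every SM
  (void)n;                          // a real chooser might also cap by n
}

int main() {
  initDeviceProps();
  int threads = 0, blocks = 0;
  pickLaunch(1 << 20, sizeof(double), threads, blocks);
  printf("reduction launch: %d blocks x %d threads\n", blocks, threads);
  delete[] g_props;
  return 0;
}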