From 9dad7a009486bf67d202b223702e93610a8192ca Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 30 Jul 2019 00:14:12 +0100 Subject: [PATCH] Reproducible reduction and axpy_norm offload from Gianluca. Hopefully get CG running entirely on GPU --- Grid/lattice/Lattice_reduction.h | 262 ++++++++++++--------------- Grid/lattice/Lattice_reduction_gpu.h | 221 ++++++++++++++++++++++ Grid/lattice/Lattice_rng.h | 8 +- Grid/util/Init.cc | 15 +- 4 files changed, 355 insertions(+), 151 deletions(-) create mode 100644 Grid/lattice/Lattice_reduction_gpu.h diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index ca4583d3..7ec25410 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -23,154 +23,12 @@ Author: paboyle #include #ifdef GRID_NVCC -#include +#include #endif NAMESPACE_BEGIN(Grid); - //////////////////////////////////////////////////////////////////////////////////////////////////// - // Deterministic Reduction operations - //////////////////////////////////////////////////////////////////////////////////////////////////// -template inline RealD norm2(const Lattice &arg){ - ComplexD nrm = innerProduct(arg,arg); - return real(nrm); -} - -#ifdef GRID_NVCC -template -struct innerProductFunctor : public thrust::binary_function -{ - accelerator R operator()(T x, T y) { return innerProduct(x,y); } -}; -#endif - -// Double inner product -template -inline ComplexD innerProduct(const Lattice &left,const Lattice &right) -{ - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_typeD vector_type; - scalar_type nrm; - - GridBase *grid = left.Grid(); - - Vector sumarray(grid->SumArraySize()); - - auto left_v = left.View(); - auto right_v=right.View(); - -#if 0 - typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; - thrust::plus binary_sum; - innerProductFunctor binary_inner_p; - Integer sN = left_v.end(); - inner_t zero = Zero(); - // is there a way of using the efficient thrust reduction while maintaining memory coalescing? 
- inner_t vnrm = thrust::inner_product(thrust::device, &left_v[0], &left_v[sN], &right_v[0], zero, binary_sum, binary_inner_p); - nrm = Reduce(TensorRemove(vnrm));// sum across simd -#else - thread_for( thr,grid->SumArraySize(),{ - int mywork, myoff; - GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff); - - decltype(innerProductD(left_v[0],right_v[0])) vnrm=Zero(); // private to thread; sub summation - for(int ss=myoff;ssSumArraySize();i++){ - vvnrm = vvnrm+sumarray[i]; - } - nrm = Reduce(vvnrm);// sum across simd -#endif - right.Grid()->GlobalSum(nrm); - return nrm; -} - -///////////////////////// -// Fast axpby_norm -// z = a x + b y -// return norm z -///////////////////////// -template strong_inline RealD -axpy_norm_fast(Lattice &z,sobj a,const Lattice &x,const Lattice &y) -{ - sobj one(1.0); - return axpby_norm_fast(z,a,one,x,y); -} - -template strong_inline RealD -axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Lattice &y) -{ - const int pad = 8; - z.Checkerboard() = x.Checkerboard(); - conformable(z,x); - conformable(x,y); - - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_typeD vector_type; - RealD nrm; - - GridBase *grid = x.Grid(); - - Vector sumarray(grid->SumArraySize()*pad); - - auto x_v=x.View(); - auto y_v=y.View(); - auto z_v=z.View(); - thread_for(thr,grid->SumArraySize(), - { - int nwork, mywork, myoff; - nwork = x.Grid()->oSites(); - GridThread::GetWork(nwork,thr,mywork,myoff); - - // private to thread; sub summation - decltype(innerProductD(z_v[0],z_v[0])) vnrm=Zero(); - for(int ss=myoff;ssSumArraySize();i++){ - nrm = nrm+sumarray[i*pad]; - } - z.Grid()->GlobalSum(nrm); - return nrm; -} - - -template -inline auto sum(const LatticeUnaryExpression & expr) - ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object -{ - return sum(closure(expr)); -} - -template -inline auto sum(const LatticeBinaryExpression & expr) - ->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object -{ - return sum(closure(expr)); -} - - -template -inline auto sum(const LatticeTrinaryExpression & expr) - ->typename decltype(expr.op.func(eval(0,expr.arg1), - eval(0,expr.arg2), - eval(0,expr.arg3) - ))::scalar_object -{ - return sum(closure(expr)); -} - +#ifndef GRID_NVCC template inline typename vobj::scalar_object sum(const Lattice &arg) { @@ -211,8 +69,124 @@ inline typename vobj::scalar_object sum(const Lattice &arg) return ssum; } +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Deterministic Reduction operations +//////////////////////////////////////////////////////////////////////////////////////////////////// +template inline RealD norm2(const Lattice &arg){ + ComplexD nrm = innerProduct(arg,arg); + return real(nrm); +} + +// Double inner product +template +inline ComplexD innerProduct(const Lattice &left,const Lattice &right) +{ + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_typeD vector_type; + scalar_type nrm; + + GridBase *grid = left.Grid(); + + // Might make all code paths go this way. 
+ auto left_v = left.View(); + auto right_v=right.View(); + + const uint64_t nsimd = grid->Nsimd(); + const uint64_t sites = grid->oSites(); + + typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; + Lattice inner_tmp(grid); + auto inner_tmp_v = inner_tmp.View(); + + accelerator_for( ss, sites, nsimd,{ + auto x_l = left_v(ss); + auto y_l = right_v(ss); + coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); + }) + + nrm = TensorRemove(sum(inner_tmp)); + + right.Grid()->GlobalSum(nrm); + return nrm; +} + +///////////////////////// +// Fast axpby_norm +// z = a x + b y +// return norm z +///////////////////////// +template strong_inline RealD +axpy_norm_fast(Lattice &z,sobj a,const Lattice &x,const Lattice &y) +{ + sobj one(1.0); + return axpby_norm_fast(z,a,one,x,y); +} + +template strong_inline RealD +axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Lattice &y) +{ + const int pad = 8; + z.Checkerboard() = x.Checkerboard(); + conformable(z,x); + conformable(x,y); + + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_typeD vector_type; + RealD nrm; + + GridBase *grid = x.Grid(); + + auto x_v=x.View(); + auto y_v=y.View(); + auto z_v=z.View(); + + const uint64_t nsimd = grid->Nsimd(); + const uint64_t sites = grid->oSites(); + + typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; + Lattice inner_tmp(grid); + + accelerator_for( ss, sites, nsimd,{ + auto tmp = a*x_v(ss)+b*y_v(ss); + coalescedWrite(inner_tmp[ss],innerProduct(tmp,tmp)); + coalescedWrite(z_v[ss],tmp); + }) + + nrm = TensorRemove(sum(inner_tmp)); + + z.Grid()->GlobalSum(nrm); + return nrm; +} + + +template +inline auto sum(const LatticeUnaryExpression & expr) + ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object +{ + return sum(closure(expr)); +} + +template +inline auto sum(const LatticeBinaryExpression & expr) + ->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object +{ + return sum(closure(expr)); +} + + +template +inline auto sum(const LatticeTrinaryExpression & expr) + ->typename decltype(expr.op.func(eval(0,expr.arg1), + eval(0,expr.arg2), + eval(0,expr.arg3) + ))::scalar_object +{ + return sum(closure(expr)); +} + ////////////////////////////////////////////////////////////////////////////////////////////////////////////// // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... 
////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h new file mode 100644 index 00000000..bdd3566f --- /dev/null +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -0,0 +1,221 @@ +NAMESPACE_BEGIN(Grid); + +#define WARP_SIZE 32 +extern cudaDeviceProp *gpu_props; +__device__ unsigned int retirementCount = 0; + +template +unsigned int nextPow2(Iterator x) { + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return ++x; +} + +template +void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) { + + int device; + cudaGetDevice(&device); + + Iterator warpSize = gpu_props[device].warpSize; + Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock; + Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock; + Iterator multiProcessorCount = gpu_props[device].multiProcessorCount; + + std::cout << GridLogDebug << "GPU has:" << std::endl; + std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl; + std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl; + std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl; + std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << warpSize << std::endl; + std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl; + + if (warpSize != WARP_SIZE) { + std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl; + exit(EXIT_FAILURE); + } + + // let the number of threads in a block be a multiple of 2, starting from warpSize + threads = warpSize; + while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2; + // keep all the streaming multiprocessors busy + blocks = nextPow2(multiProcessorCount); + +} + +template +__device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid) { + + Iterator blockSize = blockDim.x; + + // cannot use overloaded operators for sobj as they are not volatile-qualified + memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj)); + __syncwarp(); + + const Iterator VEC = WARP_SIZE; + const Iterator vid = tid & (VEC-1); + + sobj beta, temp; + memcpy((void *)&beta, (void *)&mySum, sizeof(sobj)); + + for (int i = VEC/2; i > 0; i>>=1) { + if (vid < i) { + memcpy((void *)&temp, (void *)&sdata[tid+i], sizeof(sobj)); + beta += temp; + memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj)); + } + __syncwarp(); + } + __syncthreads(); + + if (threadIdx.x == 0) { + beta = Zero(); + for (Iterator i = 0; i < blockSize; i += VEC) { + memcpy((void *)&temp, (void *)&sdata[i], sizeof(sobj)); + beta += temp; + } + memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj)); + } + __syncthreads(); +} + +template +__device__ void reduceBlocks(const LatticeView g_idata, typename vobj::scalar_object *g_odata, Iterator n) { + + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_object sobj; + constexpr Iterator nsimd = sizeof(vector_type)/sizeof(scalar_type); + + Iterator blockSize = blockDim.x; + + // force shared memory alignment + extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[]; + // it's not possible to have two extern __shared__ arrays with same name + // but different types in 
different scopes -- need to cast each time + sobj *sdata = (sobj *)shmem_pointer; + + // first level of reduction, + // each thread writes result in mySum + Iterator tid = threadIdx.x; + Iterator i = blockIdx.x*(blockSize*2) + threadIdx.x; + Iterator gridSize = blockSize*2*gridDim.x; + sobj mySum = Zero(); + + while (i < n) { + Iterator lane = i % nsimd; + Iterator ss = i / nsimd; + auto tmp = extractLane(lane,g_idata[ss]); + mySum += tmp; + + if (i + blockSize < n) { + lane = (i+blockSize) % nsimd; + ss = (i+blockSize) / nsimd; + tmp = extractLane(lane,g_idata[ss]); + mySum += tmp; + } + i += gridSize; + } + + // copy mySum to shared memory and perform + // reduction for all threads in this block + reduceBlock(sdata, mySum, tid); + if (tid == 0) g_odata[blockIdx.x] = sdata[0]; +} + +template +__global__ void reduceKernel(const LatticeView lat, typename vobj::scalar_object *buffer, Iterator n) { + typedef typename vobj::scalar_object sobj; + + Iterator blockSize = blockDim.x; + + // perform reduction for this block and + // write result to global memory buffer + reduceBlocks(lat, buffer, n); + + if (gridDim.x > 1) { + + const Iterator tid = threadIdx.x; + __shared__ bool amLast; + // force shared memory alignment + extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[]; + // it's not possible to have two extern __shared__ arrays with same name + // but different types in different scopes -- need to cast each time + sobj *smem = (sobj *)shmem_pointer; + + // wait until all outstanding memory instructions in this thread are finished + __threadfence(); + + if (tid==0) { + unsigned int ticket = atomicInc(&retirementCount, gridDim.x); + // true if this block is the last block to be done + amLast = (ticket == gridDim.x-1); + } + + // each thread must read the correct value of amLast + __syncthreads(); + + if (amLast) { + // reduce buffer[0], ..., buffer[gridDim.x-1] + Iterator i = tid; + sobj mySum = Zero(); + + while (i < gridDim.x) { + mySum += buffer[i]; + i += blockSize; + } + + reduceBlock(smem, mySum, tid); + + if (tid==0) { + buffer[0] = smem[0]; + // reset count variable + retirementCount = 0; + } + } + } +} + +template +inline typename vobj::scalar_object sum(const Lattice &lat) +{ + + LatticeView lat_v = lat.View(); + + typedef typename vobj::scalar_object sobj; + typedef decltype(lat_v.begin()) Iterator; + + Iterator size = lat.Grid()->lSites(); + + Iterator numThreads, numBlocks; + getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks); + Iterator smemSize = numThreads * sizeof(sobj); + /* + std::cout << GridLogDebug << "Starting reduction with:" << std::endl; + std::cout << GridLogDebug << "\tsize = " << size << std::endl; + std::cout << GridLogDebug << "\tnumThreads = " << numThreads << std::endl; + std::cout << GridLogDebug << "\tnumBlocks = " << numBlocks << std::endl; + std::cout << GridLogDebug << "\tsmemSize = " << smemSize << std::endl; + */ + + Vector buffer(numBlocks); + sobj *buffer_v = &buffer[0]; + + reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat_v, buffer_v, size); + cudaDeviceSynchronize(); + + cudaError err = cudaGetLastError(); + if ( cudaSuccess != err ) { + printf("Cuda error %s\n",cudaGetErrorString( err )); + exit(0); + } + + auto result = buffer_v[0]; + lat.Grid()->GlobalSum(result); + return result; +} + +NAMESPACE_END(Grid); diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index ebeea812..1bb1f087 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -108,12 +108,16 @@ void 
fillScalar(scalar &s,distribution &dist,generator & gen) template void fillScalar(ComplexF &s,distribution &dist, generator &gen) { - s=ComplexF(dist(gen),dist(gen)); + // s=ComplexF(dist(gen),dist(gen)); + s.real(dist(gen)); + s.imag(dist(gen)); } template void fillScalar(ComplexD &s,distribution &dist,generator &gen) { - s=ComplexD(dist(gen),dist(gen)); + // s=ComplexD(dist(gen),dist(gen)); + s.real(dist(gen)); + s.imag(dist(gen)); } class GridRNGbase { diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index a8410d0f..4fa3e963 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -281,14 +281,17 @@ void GridBanner(void) printed=1; } } +#ifdef GRID_NVCC +cudaDeviceProp *gpu_props; +#endif void GridGpuInit(void) { #ifdef GRID_NVCC int nDevices = 1; cudaGetDeviceCount(&nDevices); + gpu_props = new cudaDeviceProp[nDevices]; char * localRankStr = NULL; - int rank = 0, device = 0, world_rank=0; #define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" @@ -324,19 +327,21 @@ void GridGpuInit(void) #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, i); + // cudaGetDeviceProperties(&prop, i); + cudaGetDeviceProperties(&gpu_props[i], i); + prop = gpu_props[i]; printf("GpuInit: ========================\n"); printf("GpuInit: Device Number : %d\n", i); printf("GpuInit: ========================\n"); printf("GpuInit: Device identifier: %s\n", prop.name); - printf("GpuInit: Peak Memory Bandwidth (GB/s): %f\n",(float)2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6); + // printf("GpuInit: Peak Memory Bandwidth (GB/s): %f\n",(float)2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6); GPU_PROP(managedMemory); + GPU_PROP(isMultiGpuBoard); + GPU_PROP(warpSize); #if 0 GPU_PROP(unifiedAddressing); - GPU_PROP(isMultiGpuBoard); GPU_PROP(l2CacheSize); GPU_PROP(singleToDoublePrecisionPerfRatio); - GPU_PROP(warpSize); #endif } printf("GpuInit: ================================================\n");
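
The notes below sketch the main techniques in this patch as small standalone CUDA/C++ programs. They are illustrative approximations written against the plain CUDA runtime, not Grid API: kernel and helper names (axpby_norm_kernel and the rest) are invented here, and details such as the use of atomicAdd differ from the patch, which instead writes innerProduct(tmp,tmp) into a temporary Lattice and reduces it with sum() followed by GlobalSum across ranks.

First, the fused axpby_norm_fast idea: form z = a*x + b*y and accumulate |z|^2 in the same pass, so the conjugate-gradient linear combination and its norm need only one sweep through memory. A minimal sketch for plain doubles, assuming a power-of-two block size and a device of compute capability 6.0 or newer for atomicAdd on double:

#include <cuda_runtime.h>

// Fused z = a*x + b*y with on-the-fly accumulation of |z|^2, so the vectors
// are streamed through memory exactly once.
__global__ void axpby_norm_kernel(double a, double b,
                                  const double *x, const double *y,
                                  double *z, double *norm2, long n)
{
  extern __shared__ double sdata[];

  // Grid-stride loop: form the linear combination and accumulate locally.
  double local = 0.0;
  for (long i = blockIdx.x * (long)blockDim.x + threadIdx.x;
       i < n; i += (long)blockDim.x * gridDim.x) {
    double t = a * x[i] + b * y[i];
    z[i]  = t;
    local += t * t;
  }

  // Block-level tree reduction of the per-thread partial sums
  // (blockDim.x is assumed to be a power of two).
  sdata[threadIdx.x] = local;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) sdata[threadIdx.x] += sdata[threadIdx.x + s];
    __syncthreads();
  }

  // One atomic per block folds the partial sums into the global result.
  if (threadIdx.x == 0) atomicAdd(norm2, sdata[0]);
}

Driving this from the host follows the same pattern as the full reduction sketch below: launch with blockDim.x * sizeof(double) bytes of dynamic shared memory, cudaDeviceSynchronize(), read back the scalar, and in the Grid setting finish with a GlobalSum over MPI ranks.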
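
Second, the single-launch reduction in Lattice_reduction_gpu.h: every block writes its partial sum to a global buffer, and the last block to retire, detected with atomicInc on a retirement counter, reduces that buffer, so no second kernel launch is needed. The sketch below is a scalar stand-in for the templated reduceKernel/reduceBlock pair, again assuming a power-of-two block size:

#include <cstdio>
#include <cuda_runtime.h>

__device__ unsigned int retirementCount = 0;

// Shared-memory tree reduction within one block (blockDim.x a power of two).
__device__ double blockReduce(double v, double *sdata)
{
  sdata[threadIdx.x] = v;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) sdata[threadIdx.x] += sdata[threadIdx.x + s];
    __syncthreads();
  }
  return sdata[0];
}

__global__ void reduceKernelSketch(const double *in, double *partial, long n)
{
  extern __shared__ double sdata[];

  // Stage 1: each block reduces a grid-stride slice of the input.
  double sum = 0.0;
  for (long i = blockIdx.x * (long)blockDim.x + threadIdx.x;
       i < n; i += (long)blockDim.x * gridDim.x) sum += in[i];
  sum = blockReduce(sum, sdata);
  if (threadIdx.x == 0) partial[blockIdx.x] = sum;

  // Stage 2: the last block to finish reduces the per-block partials.
  __shared__ bool amLast;
  __threadfence();                       // make partial[] visible to other blocks
  if (threadIdx.x == 0) {
    unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
    amLast = (ticket == gridDim.x - 1);  // true only for the final block
  }
  __syncthreads();                       // every thread must see amLast
  if (amLast) {
    double s = 0.0;
    for (long i = threadIdx.x; i < gridDim.x; i += blockDim.x) s += partial[i];
    s = blockReduce(s, sdata);
    if (threadIdx.x == 0) { partial[0] = s; retirementCount = 0; }
  }
}

int main()
{
  const long N = 1 << 20;
  const int threads = 256, blocks = 64;
  double *in, *partial;
  cudaMallocManaged(&in, N * sizeof(double));
  cudaMallocManaged(&partial, blocks * sizeof(double));
  for (long i = 0; i < N; i++) in[i] = 1.0;
  reduceKernelSketch<<<blocks, threads, threads * sizeof(double)>>>(in, partial, N);
  cudaDeviceSynchronize();
  printf("sum = %f (expect %ld)\n", partial[0], N);
  cudaFree(in); cudaFree(partial);
  return 0;
}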
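
Third, getNumBlocksAndThreads and the new gpu_props array in Init.cc work together: device properties are queried once at start-up and cached, and each reduction then grows the block size in powers of two while the shared-memory tile still fits in sharedMemPerBlock, with enough blocks to cover the streaming multiprocessors. A host-only sketch of that heuristic, using standard CUDA runtime calls and a hypothetical reduceSetup() entry point:

#include <cstdio>
#include <cuda_runtime.h>

static cudaDeviceProp *gpu_props = nullptr;   // cached once, as in GridGpuInit

static unsigned int nextPow2(unsigned int x) {
  --x; x |= x>>1; x |= x>>2; x |= x>>4; x |= x>>8; x |= x>>16; return ++x;
}

// Pick (threads, blocks) for a reduction over objects of size sizeofsobj.
void reduceSetup(size_t sizeofsobj, int &threads, int &blocks)
{
  int device = 0;
  cudaGetDevice(&device);
  if (!gpu_props) {                 // query and cache the properties once
    int n = 0; cudaGetDeviceCount(&n);
    gpu_props = new cudaDeviceProp[n];
    for (int i = 0; i < n; i++) cudaGetDeviceProperties(&gpu_props[i], i);
  }
  const cudaDeviceProp &p = gpu_props[device];

  // Grow the block in powers of two while the shared-memory tile still fits
  // and the thread count stays within the per-block limit.
  threads = p.warpSize;
  while (2 * (size_t)threads * sizeofsobj < p.sharedMemPerBlock &&
         2 * threads <= p.maxThreadsPerBlock) threads *= 2;

  // Enough blocks to keep every streaming multiprocessor busy.
  blocks = nextPow2(p.multiProcessorCount);
}

int main() {
  int threads, blocks;
  reduceSetup(2 * sizeof(double), threads, blocks);  // e.g. one ComplexD per site
  printf("threads=%d blocks=%d\n", threads, blocks);
  return 0;
}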
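
Finally, a likely motivation for the Lattice_rng.h change: in ComplexF(dist(gen),dist(gen)) the order in which the two constructor arguments are evaluated is unspecified in C++, so different compilers may draw the real and imaginary parts in opposite orders. Filling the two parts with separate statements pins the draw order and keeps the RNG stream reproducible. A minimal illustration of the fixed pattern, using std RNGs rather than Grid's:

#include <complex>
#include <random>
#include <cstdio>

int main()
{
  std::mt19937 gen(42);
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);

  // Unspecified: the compiler may evaluate either dist(gen) first,
  // so real/imag could be swapped between builds.
  // std::complex<float> s(dist(gen), dist(gen));

  // Pinned order: the real part is always drawn first, then the imaginary.
  std::complex<float> s;
  s.real(dist(gen));
  s.imag(dist(gen));

  printf("(%f, %f)\n", s.real(), s.imag());
  return 0;
}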