From 3e49dc8a671b6b6eeff8d63513876a9e01b6f6ca Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Wed, 14 Aug 2019 15:18:34 +0100
Subject: [PATCH] Reduction finished and hopefully fixes CI regression failure
 on single precision and force

---
 Grid/lattice/Lattice_reduction.h     | 123 ++++++++++++++++++---------
 Grid/lattice/Lattice_reduction_gpu.h |  69 ++++++++-------
 2 files changed, 119 insertions(+), 73 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index a5366435..c8f4e7b9 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -29,55 +29,58 @@ Author: paboyle
 #endif
 
 NAMESPACE_BEGIN(Grid);
+
+//////////////////////////////////////////////////////
+// FIXME this should promote to double and accumulate
+//////////////////////////////////////////////////////
 template<class vobj>
-inline typename vobj::scalar_object sum_cpu(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
 {
-  GridBase *grid=arg.Grid();
-  int Nsimd = grid->Nsimd();
-
-  Vector<vobj> sumarray(grid->SumArraySize());
-  for(int i=0;i<grid->SumArraySize();i++){
+  typedef typename vobj::scalar_object sobj;
+
+  const int Nsimd   = vobj::Nsimd();
+  const int nthread = GridThread::GetThreads();
+
+  Vector<sobj> sumarray(nthread);
+  for(int i=0;i<nthread;i++){
     sumarray[i]=Zero();
   }
 
-  thread_for(thr,grid->SumArraySize(), {
+  thread_for(thr,nthread, {
     int nwork, mywork, myoff;
-    nwork = grid->oSites();
+    nwork = osites;
     GridThread::GetWork(nwork,thr,mywork,myoff);
-    vobj vvsum=Zero();
+
+    vobj vvsum=Zero();
     for(int ss=myoff;ss<mywork+myoff; ss++){
       vvsum = vvsum + arg[ss];
     }
-    sumarray[thr]=vvsum;
+    sumarray[thr]=Reduce(vvsum);
   });
 
-  vobj vsum=Zero();  // sum across threads
-  for(int i=0;i<grid->SumArraySize();i++){
-    vsum = vsum+sumarray[i];
+  sobj ssum=Zero();  // sum across threads
+  for(int i=0;i<nthread;i++){
+    ssum = ssum+sumarray[i];
   }
-  typedef typename vobj::scalar_object sobj;
-  sobj ssum=Zero();
-
-  ExtractBuffer<sobj> buf(Nsimd);
-  extract(vsum,buf);
-
-  for(int i=0;i<Nsimd;i++){
-    ssum = ssum + buf[i];
-  }
-  arg.Grid()->GlobalSum(ssum);
+
   return ssum;
 }
-
+template<class vobj>
+inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
+{
+#ifdef GRID_NVCC
+  return sum_gpu(arg,osites);
+#else
+  return sum_cpu(arg,osites);
+#endif
+}
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
-#ifdef GRID_NVCC
-  return sum_gpu(arg);
-#else
-  return sum_cpu(arg);
-#endif
+  auto arg_v = arg.View();
+  Integer osites = arg.Grid()->oSites();
+  auto ssum= sum(&arg_v[0],osites);
+  arg.Grid()->GlobalSum(ssum);
+  return ssum;
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -94,7 +97,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
 {
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_typeD vector_type;
-  scalar_type nrm;
+  ComplexD nrm;
 
   GridBase *grid = left.Grid();
 
@@ -104,20 +107,43 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
+
+  typename vobj::scalar_object f;
+  typename vobj::scalar_objectD d;
+  f=Zero();
+  d=f;
+#ifdef GRID_NVCC
+  // GPU - SIMT lane compliance...
   typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
-  Lattice<inner_t> inner_tmp(grid);
-  auto inner_tmp_v = inner_tmp.View();
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+
   accelerator_for( ss, sites, nsimd,{
     auto x_l = left_v(ss);
    auto y_l = right_v(ss);
     coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
   })
-  nrm = TensorRemove(sum(inner_tmp));
+  // This is in single precision and fails some tests
+  // Need a sumD that sums in double
+  nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
+#else
+  // CPU
+  typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+
+  accelerator_for( ss, sites, nsimd,{
+    auto x_l = left_v[ss];
+    auto y_l = right_v[ss];
+    inner_tmp_v[ss]=innerProductD(x_l,y_l);
+  })
+  nrm = TensorRemove(sum(inner_tmp_v,sites));
+#endif
+  grid->GlobalSum(nrm);
 
-  // right.Grid()->GlobalSum(nrm);
   return nrm;
 }
 
@@ -153,19 +179,34 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
 
+#ifdef GRID_NVCC
+  // GPU
   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
-  Lattice<inner_t> inner_tmp(grid);
-  auto inner_tmp_v = inner_tmp.View();
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
 
   accelerator_for( ss, sites, nsimd,{
     auto tmp = a*x_v(ss)+b*y_v(ss);
     coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
     coalescedWrite(z_v[ss],tmp);
-  })
-
-  nrm = real(TensorRemove(sum(inner_tmp)));
+  });
 
-  // z.Grid()->GlobalSum(nrm);
+  nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
+#else
+  // CPU
+  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+
+  accelerator_for( ss, sites, nsimd,{
+    auto tmp = a*x_v(ss)+b*y_v(ss);
+    inner_tmp_v[ss]=innerProductD(tmp,tmp);
+    z_v[ss]=tmp;
+  });
+  // Already promoted to double
+  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
+#endif
+  grid->GlobalSum(nrm);
   return nrm;
 }
 
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index d2906492..c5d75356 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -82,13 +82,11 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
   __syncthreads();
 }
 
-template <class vobj, class Iterator>
-__device__ void reduceBlocks(const LatticeView<vobj> g_idata, typename vobj::scalar_object *g_odata, Iterator n) {
-
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_object sobj;
-  constexpr Iterator nsimd = sizeof(vector_type)/sizeof(scalar_type);
+
+template <class vobj, class sobj, class Iterator>
+__device__ void reduceBlocks(const vobj *g_idata, sobj *g_odata, Iterator n)
+{
+  constexpr Iterator nsimd = vobj::Nsimd();
 
   Iterator blockSize = blockDim.x;
 
@@ -109,13 +107,16 @@ __device__ void reduceBlocks(const LatticeView<vobj> g_idata, typename vobj::sca
     Iterator lane = i % nsimd;
     Iterator ss   = i / nsimd;
     auto tmp = extractLane(lane,g_idata[ss]);
-    mySum += tmp;
+    sobj tmpD;
+    tmpD=tmp;
+    mySum +=tmpD;
 
     if (i + blockSize < n) {
       lane = (i+blockSize) % nsimd;
       ss   = (i+blockSize) / nsimd;
       tmp = extractLane(lane,g_idata[ss]);
-      mySum += tmp;
+      tmpD = tmp;
+      mySum += tmpD;
     }
     i += gridSize;
   }
@@ -126,9 +127,8 @@
   if (tid == 0) g_odata[blockIdx.x] = sdata[0];
 }
 
-template <class vobj, class Iterator>
-__global__ void reduceKernel(const LatticeView<vobj> lat, typename vobj::scalar_object *buffer, Iterator n) {
-  typedef typename vobj::scalar_object sobj;
+template <class vobj, class sobj, class Iterator>
+__global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 
   Iterator blockSize = blockDim.x;
 
@@ -179,32 +179,26 @@ __global__ void reduceKernel(const LatticeView<vobj> lat, typename vobj::scalar_
   }
 }
 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Possibly promote to double and sum
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_object sum_gpu(const Lattice<vobj> &lat)
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
+  typedef typename vobj::scalar_objectD sobj;
+  typedef decltype(lat) Iterator;
 
-  LatticeView<vobj> lat_v = lat.View();
-
-  typedef typename vobj::scalar_object sobj;
-  typedef decltype(lat_v.begin()) Iterator;
-
-  Iterator size = lat.Grid()->lSites();
-
-  Iterator numThreads, numBlocks;
+  Integer nsimd= vobj::Nsimd();
+  Integer size = osites*nsimd;
+
+  Integer numThreads, numBlocks;
   getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
-  Iterator smemSize = numThreads * sizeof(sobj);
-  /*
-  std::cout << GridLogDebug << "Starting reduction with:" << std::endl;
-  std::cout << GridLogDebug << "\tsize = " << size << std::endl;
-  std::cout << GridLogDebug << "\tnumThreads = " << numThreads << std::endl;
-  std::cout << GridLogDebug << "\tnumBlocks = " << numBlocks << std::endl;
-  std::cout << GridLogDebug << "\tsmemSize = " << smemSize << std::endl;
-  */
+  Integer smemSize = numThreads * sizeof(sobj);
 
   Vector<sobj> buffer(numBlocks);
   sobj *buffer_v = &buffer[0];
 
-  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat_v, buffer_v, size);
+  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
   cudaDeviceSynchronize();
 
   cudaError err = cudaGetLastError();
@@ -212,10 +206,21 @@ inline typename vobj::scalar_object sum_gpu(const Lattice<vobj> &lat)
     printf("Cuda error %s\n",cudaGetErrorString( err ));
     exit(0);
   }
-
   auto result = buffer_v[0];
-  lat.Grid()->GlobalSum(result);
+  return result;
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Return as same precision as input performing reduction in double precision though
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <class vobj>
+inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj result;
+  result = sumD_gpu(lat,osites);
   return result;
 }
+
+
 NAMESPACE_END(Grid);
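
The numerical point behind the patch, spelled out: the old paths reduced a single-precision lattice with a single-precision accumulator, so once the running total is large enough, each new O(1) term falls below one ULP of the total and is rounded away. That is how a reduction over a large volume comes out measurably wrong and trips the single-precision CI test. A minimal standalone sketch (plain C++, independent of Grid's types) showing the failure mode and the fix:

#include <cstdio>
#include <vector>

int main() {
  const long N = 1L << 25;   // 33,554,432 terms; a modest lattice-volume scale
  std::vector<float> x(N, 1.0f);

  float  accF = 0.0f;        // single-precision accumulator (the old behaviour)
  double accD = 0.0;         // double accumulator (what innerProductD/sumD_gpu do)
  for (long i = 0; i < N; i++) {
    accF += x[i];            // stalls at 2^24: adding 1.0f can no longer change it
    accD += (double)x[i];    // promote each term before accumulating
  }
  std::printf("float  accumulator = %.1f\n", (double)accF); // 16777216.0
  std::printf("double accumulator = %.1f\n", accD);         // 33554432.0
  return 0;
}

Half the sum is simply lost in the float accumulator. Promoting to double before accumulating, as the innerProductD/sumD_gpu paths now do, keeps every term; the result is rounded back to the input precision only once, at the very end.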
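Structurally, the change splits every reduction into two layers: a node-local, pointer-level routine taking (const vobj *, Integer osites) that accumulates in double (sumD_gpu, and the innerProductD path on the CPU side), plus a thin wrapper that rounds back to the input precision (sum_gpu) or, at the Lattice level, performs the single GlobalSum. A simplified CPU-only sketch of that layering (sumD_local and sum_local are illustrative names, not Grid identifiers):

#include <cstdio>
#include <cstddef>

// Node-local reduction that accumulates in double whatever the input
// precision T is (cf. sumD_gpu in the patch).
template <class T>
double sumD_local(const T *arg, std::size_t n) {
  double acc = 0.0;
  for (std::size_t i = 0; i < n; i++) acc += double(arg[i]);
  return acc;
}

// Wrapper returning the input precision; the narrowing happens exactly once,
// on the finished sum (cf. sum_gpu). Neither routine knows about MPI.
template <class T>
T sum_local(const T *arg, std::size_t n) {
  return T(sumD_local(arg, n));
}

int main() {
  float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  std::printf("%f\n", (double)sum_local(x, 4)); // 10.000000
  return 0;
}

Keeping GlobalSum out of the pointer-level routines is what lets innerProduct and axpby_norm_fast run them on scratch Vector<inner_t> buffers that have no grid attached; each caller then does one GlobalSum on the final scalar.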
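On the GPU side the same promotion happens per SIMD lane: reduceBlocks extracts each lane in the source precision, copies it into the double-precision scalar object tmpD, and only then folds it into mySum, so every per-thread partial sum is already double. A serial sketch of that lane indexing and promotion, with a hypothetical stand-in vector type in place of Grid's vobj and extractLane:

#include <cstddef>

// Stand-in for a vectorised object: Nsimd single-precision lanes.
template <int Nsimd>
struct VecF { float lane[Nsimd]; };

// Stand-in for Grid's extractLane: pull one SIMD lane out of a vector object.
template <int Nsimd>
float extractLane(int lane, const VecF<Nsimd> &v) { return v.lane[lane]; }

// Serial analogue of the reduceBlocks indexing: global lane i lives in
// vector object ss = i/Nsimd at lane i%Nsimd, and the extracted value is
// widened to double *before* it touches the accumulator.
template <int Nsimd>
double reduce_promoted(const VecF<Nsimd> *g_idata, std::size_t osites) {
  double mySum = 0.0;
  for (std::size_t i = 0; i < osites * Nsimd; i++) {
    std::size_t ss = i / Nsimd;
    int lane = (int)(i % Nsimd);
    double tmpD = extractLane(lane, g_idata[ss]); // widen, then accumulate
    mySum += tmpD;
  }
  return mySum;
}

int main() {
  VecF<4> v[2] = {{{1, 2, 3, 4}}, {{5, 6, 7, 8}}};
  return reduce_promoted(v, 2) == 36.0 ? 0 : 1; // all 8 lanes: 36
}

This walk over scalar lanes is also why sumD_gpu now computes size as osites*nsimd: the pointer-level routine has no Grid to ask for lSites(), but the lane count of the flat vobj array it is handed is the same number.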