mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Reduction finished and hopefully fixes CI regression fail on single precision and force

Peter Boyle 2019-08-14 15:18:34 +01:00
parent 96ac56cace
commit 3e49dc8a67
2 changed files with 119 additions and 73 deletions
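
Why this matters: summing a large lattice in working precision lets rounding error grow with the number of terms, which is what tripped the CI tolerance; the new sumD_* paths accumulate in double and only truncate at the end. A minimal standalone sketch of the effect (illustrative only, not Grid code):

    // Accumulating ~1e7 small terms: the float sum drifts from the exact
    // answer (1000), the double sum stays well inside any sane tolerance.
    #include <cstdio>
    int main(void) {
      const long N = 10000000;
      float  sumf = 0.0f;
      double sumd = 0.0;
      for (long i = 0; i < N; i++) {
        sumf += 1.0e-4f;   // float: per-add rounding error compounds as the sum grows
        sumd += 1.0e-4;    // double: relative rounding error is ~2^-29 smaller
      }
      printf("float  sum = %.6f\n", sumf);   // visibly off from 1000
      printf("double sum = %.6f\n", sumd);   // ~1000.000000
      return 0;
    }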

Grid/lattice/Lattice_reduction.h

@@ -29,55 +29,58 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #endif
 NAMESPACE_BEGIN(Grid);
+
+//////////////////////////////////////////////////////
+// FIXME this should promote to double and accumulate
+//////////////////////////////////////////////////////
 template<class vobj>
-inline typename vobj::scalar_object sum_cpu(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
 {
-  GridBase *grid=arg.Grid();
-  int Nsimd = grid->Nsimd();
-
-  Vector<vobj> sumarray(grid->SumArraySize());
-  for(int i=0;i<grid->SumArraySize();i++){
+  typedef typename vobj::scalar_object sobj;
+
+  const int Nsimd = vobj::Nsimd();
+  const int nthread = GridThread::GetThreads();
+
+  Vector<sobj> sumarray(nthread);
+  for(int i=0;i<nthread;i++){
     sumarray[i]=Zero();
   }
-
-  auto arg_v=arg.View();
-  thread_for(thr,grid->SumArraySize(), {
+
+  thread_for(thr,nthread, {
     int nwork, mywork, myoff;
-    nwork = grid->oSites();
+    nwork = osites;
     GridThread::GetWork(nwork,thr,mywork,myoff);
     vobj vvsum=Zero();
     for(int ss=myoff;ss<mywork+myoff; ss++){
-      vvsum = vvsum + arg_v[ss];
+      vvsum = vvsum + arg[ss];
     }
-    sumarray[thr]=vvsum;
+    sumarray[thr]=Reduce(vvsum);
   });
-
-  vobj vsum=Zero();  // sum across threads
-  for(int i=0;i<grid->SumArraySize();i++){
-    vsum = vsum+sumarray[i];
+
+  sobj ssum=Zero();  // sum across threads
+  for(int i=0;i<nthread;i++){
+    ssum = ssum+sumarray[i];
   }
-
-  typedef typename vobj::scalar_object sobj;
-  sobj ssum=Zero();
-  ExtractBuffer<sobj> buf(Nsimd);
-  extract(vsum,buf);
-  for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
-  arg.Grid()->GlobalSum(ssum);
-
   return ssum;
 }
+
+template<class vobj>
+inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
+{
+#ifdef GRID_NVCC
+  return sum_gpu(arg,osites);
+#else
+  return sum_cpu(arg,osites);
+#endif
+}
+
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
-#ifdef GRID_NVCC
-  return sum_gpu(arg);
-#else
-  return sum_cpu(arg);
-#endif
+  auto arg_v = arg.View();
+  Integer osites = arg.Grid()->oSites();
+  auto ssum= sum(&arg_v[0],osites);
+  arg.Grid()->GlobalSum(ssum);
+  return ssum;
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -94,7 +97,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_typeD vector_type;
-  scalar_type nrm;
+  ComplexD nrm;
 
   GridBase *grid = left.Grid();
@@ -104,20 +107,43 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
 
+  typename vobj::scalar_object f;
+  typename vobj::scalar_objectD d;
+  f=Zero();
+  d=f;
+
+#ifdef GRID_NVCC
+  // GPU - SIMT lane compliance...
   typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
-  Lattice<inner_t> inner_tmp(grid);
-  auto inner_tmp_v = inner_tmp.View();
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
 
   accelerator_for( ss, sites, nsimd,{
       auto x_l = left_v(ss);
       auto y_l = right_v(ss);
       coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
   })
-  nrm = TensorRemove(sum(inner_tmp));
+
+  // This is in single precision and fails some tests
+  // Need a sumD that sums in double
+  nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
+#else
+  // CPU
+  typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+
+  accelerator_for( ss, sites, nsimd,{
+      auto x_l = left_v[ss];
+      auto y_l = right_v[ss];
+      inner_tmp_v[ss]=innerProductD(x_l,y_l);
+  })
+  nrm = TensorRemove(sum(inner_tmp_v,sites));
+#endif
+  grid->GlobalSum(nrm);
+  //  right.Grid()->GlobalSum(nrm);
+
   return nrm;
 }
@@ -153,19 +179,34 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
   const uint64_t nsimd = grid->Nsimd();
   const uint64_t sites = grid->oSites();
 
+#ifdef GRID_NVCC
+  // GPU
   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
-  Lattice<inner_t> inner_tmp(grid);
-  auto inner_tmp_v = inner_tmp.View();
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
 
   accelerator_for( ss, sites, nsimd,{
       auto tmp = a*x_v(ss)+b*y_v(ss);
       coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
       coalescedWrite(z_v[ss],tmp);
-  })
-  nrm = real(TensorRemove(sum(inner_tmp)));
-  //  z.Grid()->GlobalSum(nrm);
+  });
+
+  nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
+#else
+  // CPU
+  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+
+  accelerator_for( ss, sites, nsimd,{
+      auto tmp = a*x_v(ss)+b*y_v(ss);
+      inner_tmp_v[ss]=innerProductD(tmp,tmp);
+      z_v[ss]=tmp;
+  });
+  // Already promoted to double
+  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
+#endif
+  grid->GlobalSum(nrm);
+
   return nrm;
 }
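
Note on the CPU path above: sum_cpu now reduces over a raw pointer, splitting the osites outer sites across host threads with GridThread::GetWork and keeping a per-thread SIMD-vector partial sum that is lane-reduced (Reduce) only once per thread. A minimal sketch of the even-split contract this relies on (an assumption about GetWork's behaviour, not Grid's actual implementation):

    // Hypothetical stand-in for GridThread::GetWork(nwork,thr,mywork,myoff):
    // thread `thr` of `nthread` gets the contiguous range [myoff, myoff+mywork).
    static void get_work(int nwork, int nthread, int thr, int &mywork, int &myoff) {
      int chunk = nwork / nthread;
      int rem   = nwork % nthread;         // first `rem` threads take one extra site
      mywork = chunk + (thr < rem ? 1 : 0);
      myoff  = thr * chunk + (thr < rem ? thr : rem);
    }

The ranges of all nthread threads tile [0, nwork) exactly, so each outer site is summed once and the cross-thread combine is a short serial loop over scalar partials.
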
Grid/lattice/Lattice_reduction_gpu.h
@@ -82,13 +82,11 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid)
     __syncthreads();
   }
 }
 
-template <class vobj, class Iterator>
-__device__ void reduceBlocks(const LatticeView<vobj> g_idata, typename vobj::scalar_object *g_odata, Iterator n) {
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_object sobj;
-  constexpr Iterator nsimd = sizeof(vector_type)/sizeof(scalar_type);
+template <class vobj, class sobj, class Iterator>
+__device__ void reduceBlocks(const vobj *g_idata, sobj *g_odata, Iterator n)
+{
+  constexpr Iterator nsimd = vobj::Nsimd();
 
   Iterator blockSize = blockDim.x;
@@ -109,13 +107,16 @@ __device__ void reduceBlocks(const LatticeView<vobj> g_idata, typename vobj::scalar_object *g_odata, Iterator n)
     Iterator lane = i % nsimd;
     Iterator ss   = i / nsimd;
     auto tmp = extractLane(lane,g_idata[ss]);
-    mySum += tmp;
+    sobj tmpD;
+    tmpD=tmp;
+    mySum +=tmpD;
 
     if (i + blockSize < n) {
       lane = (i+blockSize) % nsimd;
       ss   = (i+blockSize) / nsimd;
       tmp = extractLane(lane,g_idata[ss]);
-      mySum += tmp;
+      tmpD = tmp;
+      mySum += tmpD;
     }
     i += gridSize;
   }
@@ -126,9 +127,8 @@ __device__ void reduceBlocks(const LatticeView<vobj> g_idata, typename vobj::scalar_object *g_odata, Iterator n)
   if (tid == 0) g_odata[blockIdx.x] = sdata[0];
 }
 
-template <class vobj, class Iterator>
-__global__ void reduceKernel(const LatticeView<vobj> lat, typename vobj::scalar_object *buffer, Iterator n) {
-  typedef typename vobj::scalar_object sobj;
+template <class vobj, class sobj,class Iterator>
+__global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 
   Iterator blockSize = blockDim.x;
@@ -179,32 +179,26 @@ __global__ void reduceKernel(const LatticeView<vobj> lat, typename vobj::scalar_object *buffer, Iterator n)
     }
   }
 }
 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Possibly promote to double and sum
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_object sum_gpu(const Lattice<vobj> &lat)
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
-  LatticeView<vobj> lat_v = lat.View();
-
-  typedef typename vobj::scalar_object sobj;
-  typedef decltype(lat_v.begin()) Iterator;
-
-  Iterator size = lat.Grid()->lSites();
-
-  Iterator numThreads, numBlocks;
+  typedef typename vobj::scalar_objectD sobj;
+  typedef decltype(lat) Iterator;
+
+  Integer nsimd= vobj::Nsimd();
+  Integer size = osites*nsimd;
+
+  Integer numThreads, numBlocks;
   getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
-  Iterator smemSize = numThreads * sizeof(sobj);
-  /*
-  std::cout << GridLogDebug << "Starting reduction with:" << std::endl;
-  std::cout << GridLogDebug << "\tsize = " << size << std::endl;
-  std::cout << GridLogDebug << "\tnumThreads = " << numThreads << std::endl;
-  std::cout << GridLogDebug << "\tnumBlocks = " << numBlocks << std::endl;
-  std::cout << GridLogDebug << "\tsmemSize = " << smemSize << std::endl;
-  */
+  Integer smemSize = numThreads * sizeof(sobj);
 
   Vector<sobj> buffer(numBlocks);
   sobj *buffer_v = &buffer[0];
 
-  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat_v, buffer_v, size);
+  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
   cudaDeviceSynchronize();
 
   cudaError err = cudaGetLastError();
@@ -212,10 +206,21 @@ inline typename vobj::scalar_object sum_gpu(const Lattice<vobj> &lat)
     printf("Cuda error %s\n",cudaGetErrorString( err ));
     exit(0);
   }
 
   auto result = buffer_v[0];
-  lat.Grid()->GlobalSum(result);
+  return result;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Return as same precision as input performing reduction in double precision though
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <class vobj>
+inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj result;
+  result = sumD_gpu(lat,osites);
   return result;
 }
 
 NAMESPACE_END(Grid);
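
For orientation, the kernels above follow the standard two-stage CUDA reduction: each thread promotes its loads to double, a shared-memory tree folds the block down to one partial, and a final pass combines the per-block partials. A generic, simplified sketch of stage one (the pattern only; Grid's reduceBlocks additionally grid-strides over the input and extracts SIMD lanes):

    // Launch as: block_sum<<<blocks, threads, threads*sizeof(double)>>>(in, out, n);
    // `threads` must be a power of two for the tree fold below.
    __global__ void block_sum(const float *in, double *block_out, int n) {
      extern __shared__ double sdata[];
      int tid = threadIdx.x;
      int i   = blockIdx.x * blockDim.x + threadIdx.x;
      sdata[tid] = (i < n) ? (double)in[i] : 0.0;    // promote to double on load
      __syncthreads();
      for (int s = blockDim.x / 2; s > 0; s >>= 1) { // fold pairs: s, s/2, s/4, ...
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();
      }
      if (tid == 0) block_out[blockIdx.x] = sdata[0]; // one partial sum per block
    }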