Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-09 23:45:36 +00:00)
Reduction finished and hopefully fixes CI regression failure on single precision and force
This commit is contained in:
parent 96ac56cace
commit 3e49dc8a67
@@ -29,55 +29,58 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #endif
 
 NAMESPACE_BEGIN(Grid);
 
+//////////////////////////////////////////////////////
+// FIXME this should promote to double and accumulate
+//////////////////////////////////////////////////////
 template<class vobj>
-inline typename vobj::scalar_object sum_cpu(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
 {
-  GridBase *grid=arg.Grid();
-  int Nsimd = grid->Nsimd();
-  Vector<vobj> sumarray(grid->SumArraySize());
-  for(int i=0;i<grid->SumArraySize();i++){
+  typedef typename vobj::scalar_object sobj;
+
+  const int Nsimd = vobj::Nsimd();
+  const int nthread = GridThread::GetThreads();
+
+  Vector<sobj> sumarray(nthread);
+  for(int i=0;i<nthread;i++){
     sumarray[i]=Zero();
   }
 
-  auto arg_v=arg.View();
-  thread_for(thr,grid->SumArraySize(), {
+  thread_for(thr,nthread, {
     int nwork, mywork, myoff;
-    nwork = grid->oSites();
+    nwork = osites;
     GridThread::GetWork(nwork,thr,mywork,myoff);
 
     vobj vvsum=Zero();
    for(int ss=myoff;ss<mywork+myoff; ss++){
-      vvsum = vvsum + arg_v[ss];
+      vvsum = vvsum + arg[ss];
    }
-    sumarray[thr]=vvsum;
+    sumarray[thr]=Reduce(vvsum);
  });
 
-  vobj vsum=Zero(); // sum across threads
-  for(int i=0;i<grid->SumArraySize();i++){
-    vsum = vsum+sumarray[i];
+  sobj ssum=Zero(); // sum across threads
+  for(int i=0;i<nthread;i++){
+    ssum = ssum+sumarray[i];
  }
 
-  typedef typename vobj::scalar_object sobj;
-  sobj ssum=Zero();
 
-  ExtractBuffer<sobj> buf(Nsimd);
-  extract(vsum,buf);
 
-  for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
-  arg.Grid()->GlobalSum(ssum);
 
  return ssum;
 }
+template<class vobj>
+inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
+{
+#ifdef GRID_NVCC
+  return sum_gpu(arg,osites);
+#else
+  return sum_cpu(arg,osites);
+#endif
+}
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
-#ifdef GRID_NVCC
-  return sum_gpu(arg);
-#else
-  return sum_cpu(arg);
-#endif
+  auto arg_v = arg.View();
+  Integer osites = arg.Grid()->oSites();
+  auto ssum= sum(&arg_v[0],osites);
+  arg.Grid()->GlobalSum(ssum);
+  return ssum;
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
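For reference, the per-thread reduction pattern the reworked sum_cpu uses, as a minimal standalone C++ sketch. This is not Grid code; the thread pool, chunking, and all names are illustrative. Each thread reduces a contiguous chunk of sites into a private partial, and the partials are combined serially afterwards:

// Standalone illustration (not Grid code) of per-thread partial sums.
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  const int nthread = 4;                          // GridThread::GetThreads() analogue
  const size_t osites = 1 << 20;
  std::vector<float> data(osites, 0.1f);

  std::vector<double> sumarray(nthread, 0.0);     // one partial per thread
  std::vector<std::thread> pool;
  for (int thr = 0; thr < nthread; thr++) {
    pool.emplace_back([&, thr] {
      // GetWork analogue: split osites into near-equal contiguous chunks
      size_t mywork = osites / nthread, myoff = thr * mywork;
      if (thr == nthread - 1) mywork = osites - myoff;
      double vvsum = 0.0;                         // accumulate in double
      for (size_t ss = myoff; ss < myoff + mywork; ss++) vvsum += data[ss];
      sumarray[thr] = vvsum;
    });
  }
  for (auto &t : pool) t.join();

  double ssum = 0.0;                              // sum across threads
  for (int i = 0; i < nthread; i++) ssum += sumarray[i];
  std::printf("sum = %f (expected about %f)\n", ssum, 0.1 * osites);
  return 0;
}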
@@ -94,7 +97,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
 {
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
-  scalar_type nrm;
+  ComplexD nrm;
 
  GridBase *grid = left.Grid();
 
@@ -104,20 +107,43 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
 
  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();
 
+  typename vobj::scalar_object f;
+  typename vobj::scalar_objectD d;
+  f=Zero();
+  d=f;
 
+#ifdef GRID_NVCC
+  // GPU - SIMT lane compliance...
  typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
-  Lattice<inner_t> inner_tmp(grid);
-  auto inner_tmp_v = inner_tmp.View();
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
 
 
  accelerator_for( ss, sites, nsimd,{
    auto x_l = left_v(ss);
    auto y_l = right_v(ss);
    coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
  })
 
-  nrm = TensorRemove(sum(inner_tmp));
+  // This is in single precision and fails some tests
+  // Need a sumD that sums in double
+  nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
+#else
+  // CPU
+  typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+
+  accelerator_for( ss, sites, nsimd,{
+    auto x_l = left_v[ss];
+    auto y_l = right_v[ss];
+    inner_tmp_v[ss]=innerProductD(x_l,y_l);
+  })
+  nrm = TensorRemove(sum(inner_tmp_v,sites));
+#endif
+  grid->GlobalSum(nrm);
 
-  // right.Grid()->GlobalSum(nrm);
  return nrm;
 }
 
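The core idea of this hunk, in isolation: form the local inner product per site, but accumulate the reduction in double so a single-precision field does not lose accuracy in the sum. A minimal standalone C++ sketch (not Grid code; vector sizes and values are illustrative) comparing single and double accumulation:

// Standalone sketch (not Grid code) of innerProduct vs innerProductD accumulation.
#include <complex>
#include <cstdio>
#include <vector>

int main() {
  const size_t sites = 1 << 22;
  std::vector<std::complex<float>> left(sites, {1.0e-3f, 2.0e-3f});
  std::vector<std::complex<float>> right(sites, {3.0e-3f, -1.0e-3f});

  std::complex<float>  nrmF(0.f, 0.f);   // old behaviour: accumulate in single
  std::complex<double> nrmD(0.0, 0.0);   // innerProductD analogue
  for (size_t ss = 0; ss < sites; ss++) {
    auto ip = std::conj(left[ss]) * right[ss];           // innerProduct(x,y)
    nrmF += ip;                                          // drifts in single
    nrmD += std::complex<double>(ip.real(), ip.imag());  // promote, then add
  }
  std::printf("single: (%g,%g)  double: (%g,%g)\n",
              nrmF.real(), nrmF.imag(), nrmD.real(), nrmD.imag());
  return 0;
}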
@@ -153,19 +179,34 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  const uint64_t nsimd = grid->Nsimd();
  const uint64_t sites = grid->oSites();
 
+#ifdef GRID_NVCC
+  // GPU
  typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
-  Lattice<inner_t> inner_tmp(grid);
-  auto inner_tmp_v = inner_tmp.View();
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
 
  accelerator_for( ss, sites, nsimd,{
    auto tmp = a*x_v(ss)+b*y_v(ss);
    coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
    coalescedWrite(z_v[ss],tmp);
-  })
+  });
 
-  nrm = real(TensorRemove(sum(inner_tmp)));
 
-  // z.Grid()->GlobalSum(nrm);
+  nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
+#else
+  // CPU
+  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+
+  accelerator_for( ss, sites, nsimd,{
+    auto tmp = a*x_v(ss)+b*y_v(ss);
+    inner_tmp_v[ss]=innerProductD(tmp,tmp);
+    z_v[ss]=tmp;
+  });
+  // Already promoted to double
+  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
+#endif
+  grid->GlobalSum(nrm);
  return nrm;
 }
 
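The shape of axpby_norm_fast in isolation: a single pass that writes z = a*x + b*y and accumulates |z|^2 in double on the fly, rather than a second traversal for the norm. A minimal standalone C++ sketch (not Grid code; names and values are illustrative):

// Standalone sketch (not Grid code) of fused axpby + norm with double accumulation.
#include <cstdio>
#include <vector>

int main() {
  const size_t sites = 1 << 20;
  const float a = 2.0f, b = -1.0f;
  std::vector<float> x(sites, 0.5f), y(sites, 0.25f), z(sites);

  double nrm = 0.0;                       // reduction promoted to double
  for (size_t ss = 0; ss < sites; ss++) {
    float tmp = a * x[ss] + b * y[ss];    // auto tmp = a*x_v(ss)+b*y_v(ss);
    nrm += (double)tmp * (double)tmp;     // innerProductD(tmp,tmp) analogue
    z[ss] = tmp;                          // z_v[ss] = tmp;
  }
  std::printf("norm = %f\n", nrm);
  return 0;
}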
@@ -82,13 +82,11 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
  __syncthreads();
 }
 
-template <class vobj, class Iterator>
-__device__ void reduceBlocks(const LatticeView<vobj> g_idata, typename vobj::scalar_object *g_odata, Iterator n) {
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_object sobj;
-  constexpr Iterator nsimd = sizeof(vector_type)/sizeof(scalar_type);
+template <class vobj, class sobj, class Iterator>
+__device__ void reduceBlocks(const vobj *g_idata, sobj *g_odata, Iterator n)
+{
+  constexpr Iterator nsimd = vobj::Nsimd();
 
  Iterator blockSize = blockDim.x;
 
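For context, the tree reduction that reduceBlock performs in shared memory, as a minimal standalone C++ sketch (plain host code, not CUDA; the function name is illustrative). The active range is halved each step, pairing element i with element i + stride, until one value remains:

// Standalone sketch (not CUDA) of a shared-memory-style tree reduction.
#include <cstdio>
#include <vector>

double block_reduce(std::vector<double> sdata) {
  for (size_t stride = sdata.size() / 2; stride > 0; stride /= 2) {
    for (size_t tid = 0; tid < stride; tid++)   // on a GPU: one thread per tid,
      sdata[tid] += sdata[tid + stride];        // with __syncthreads() between steps
  }
  return sdata[0];
}

int main() {
  std::vector<double> partials = {1, 2, 3, 4, 5, 6, 7, 8}; // power-of-two size
  std::printf("block sum = %f\n", block_reduce(partials));
  return 0;
}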
@@ -109,13 +107,16 @@ __device__ void reduceBlocks(const LatticeView<vobj> g_idata, typename vobj::sca
    Iterator lane = i % nsimd;
    Iterator ss   = i / nsimd;
    auto tmp = extractLane(lane,g_idata[ss]);
-    mySum += tmp;
+    sobj tmpD;
+    tmpD=tmp;
+    mySum +=tmpD;
 
    if (i + blockSize < n) {
      lane = (i+blockSize) % nsimd;
      ss   = (i+blockSize) / nsimd;
      tmp = extractLane(lane,g_idata[ss]);
-      mySum += tmp;
+      tmpD = tmp;
+      mySum += tmpD;
    }
    i += gridSize;
  }
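This hunk is the promotion point on the GPU path: each SIMD lane is read in the narrow type, then widened before it touches the accumulator. A minimal standalone C++ sketch (not Grid code; the lane layout is illustrative):

// Standalone sketch (not Grid code): extract a lane, widen to double, then accumulate.
#include <cstdio>

int main() {
  const int nsimd = 4;
  const int n = 1000 * nsimd;              // flattened lane count
  static float g_idata[1000][nsimd];       // stand-in for vectorised sites
  for (int ss = 0; ss < 1000; ss++)
    for (int l = 0; l < nsimd; l++) g_idata[ss][l] = 0.1f;

  double mySum = 0.0;
  for (int i = 0; i < n; i++) {
    int lane = i % nsimd;                  // Iterator lane = i % nsimd;
    int ss   = i / nsimd;                  // Iterator ss   = i / nsimd;
    float tmp = g_idata[ss][lane];         // extractLane analogue
    double tmpD = tmp;                     // promote first ...
    mySum += tmpD;                         // ... then accumulate
  }
  std::printf("sum = %f\n", mySum);
  return 0;
}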
@@ -126,9 +127,8 @@ __device__ void reduceBlocks(const LatticeView<vobj> g_idata, typename vobj::sca
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
 }
 
-template <class vobj, class Iterator>
-__global__ void reduceKernel(const LatticeView<vobj> lat, typename vobj::scalar_object *buffer, Iterator n) {
-  typedef typename vobj::scalar_object sobj;
+template <class vobj, class sobj,class Iterator>
+__global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 
  Iterator blockSize = blockDim.x;
 
@@ -179,32 +179,26 @@ __global__ void reduceKernel(const LatticeView<vobj> lat, typename vobj::scalar_
  }
 }
 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Possibly promote to double and sum
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_object sum_gpu(const Lattice<vobj> &lat)
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
+  typedef typename vobj::scalar_objectD sobj;
+  typedef decltype(lat) Iterator;
 
-  LatticeView<vobj> lat_v = lat.View();
+  Integer nsimd= vobj::Nsimd();
+  Integer size = osites*nsimd;
 
-  typedef typename vobj::scalar_object sobj;
-  typedef decltype(lat_v.begin()) Iterator;
+  Integer numThreads, numBlocks;
 
-  Iterator size = lat.Grid()->lSites();
 
-  Iterator numThreads, numBlocks;
  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
-  Iterator smemSize = numThreads * sizeof(sobj);
-  /*
-  std::cout << GridLogDebug << "Starting reduction with:" << std::endl;
-  std::cout << GridLogDebug << "\tsize = " << size << std::endl;
-  std::cout << GridLogDebug << "\tnumThreads = " << numThreads << std::endl;
-  std::cout << GridLogDebug << "\tnumBlocks = " << numBlocks << std::endl;
-  std::cout << GridLogDebug << "\tsmemSize = " << smemSize << std::endl;
-  */
+  Integer smemSize = numThreads * sizeof(sobj);
 
  Vector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
 
-  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat_v, buffer_v, size);
+  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
  cudaDeviceSynchronize();
 
  cudaError err = cudaGetLastError();
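The two-stage shape of sumD_gpu, as a minimal standalone host-side C++ sketch (not the CUDA code above; block count and chunk sizes are illustrative): stage 1 produces one double partial per "block", stage 2 sums the small partial buffer:

// Standalone host-side sketch (not CUDA) of a two-stage block reduction.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const size_t size = 1 << 20;
  std::vector<float> lat(size, 0.25f);

  const size_t numBlocks = 64;
  const size_t chunk = (size + numBlocks - 1) / numBlocks;
  std::vector<double> buffer(numBlocks, 0.0);       // Vector<sobj> buffer(numBlocks);

  for (size_t b = 0; b < numBlocks; b++) {          // reduceKernel analogue
    size_t lo = b * chunk, hi = std::min(size, lo + chunk);
    for (size_t i = lo; i < hi; i++) buffer[b] += lat[i];
  }
  double result = 0.0;                              // final pass over partials
  for (size_t b = 0; b < numBlocks; b++) result += buffer[b];
  std::printf("result = %f\n", result);
  return 0;
}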
@@ -212,10 +206,21 @@ inline typename vobj::scalar_object sum_gpu(const Lattice<vobj> &lat)
    printf("Cuda error %s\n",cudaGetErrorString( err ));
    exit(0);
  }
 
  auto result = buffer_v[0];
-  lat.Grid()->GlobalSum(result);
+  return result;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Return as same precision as input performing reduction in double precision though
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <class vobj>
+inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj result;
+  result = sumD_gpu(lat,osites);
  return result;
 }
 
 
 NAMESPACE_END(Grid);
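The contract of the new sum_gpu, reduced to its essentials as a standalone C++ sketch (not Grid code; function names are illustrative): reduce in double via the sumD path, then narrow the result back to the input's precision on return:

// Standalone sketch (not Grid code) of "reduce in double, return input precision".
#include <cstdio>
#include <vector>

double sumD(const std::vector<float>& lat) {               // sumD_gpu analogue
  double acc = 0.0;
  for (float v : lat) acc += v;                            // accumulate in double
  return acc;
}

float sum_same_precision(const std::vector<float>& lat) {  // sum_gpu analogue
  return (float)sumD(lat);   // accurate accumulation, same-precision result
}

int main() {
  std::vector<float> lat(1 << 24, 1.0f);
  std::printf("sum = %f\n", sum_same_precision(lat));
  return 0;
}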