mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Accelerator loop use
This commit is contained in:
parent
6bf7f839ff
commit
f67830587f
@ -78,7 +78,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
|
|||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA)
|
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||||
auto arg_v = arg.View(AcceleratorRead);
|
auto arg_v = arg.View(AcceleratorRead);
|
||||||
Integer osites = arg.Grid()->oSites();
|
Integer osites = arg.Grid()->oSites();
|
||||||
auto ssum= sum_gpu(&arg_v[0],osites);
|
auto ssum= sum_gpu(&arg_v[0],osites);
|
||||||
@ -112,7 +112,6 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
const uint64_t nsimd = grid->Nsimd();
|
const uint64_t nsimd = grid->Nsimd();
|
||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
#if defined(GRID_CUDA)
|
|
||||||
// Might make all code paths go this way.
|
// Might make all code paths go this way.
|
||||||
auto left_v = left.View(AcceleratorRead);
|
auto left_v = left.View(AcceleratorRead);
|
||||||
auto right_v=right.View(AcceleratorRead);
|
auto right_v=right.View(AcceleratorRead);
|
||||||
@ -121,7 +120,6 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
|
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
Vector<inner_t> inner_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
|
|
||||||
accelerator_for( ss, sites, nsimd,{
|
accelerator_for( ss, sites, nsimd,{
|
||||||
auto x_l = left_v(ss);
|
auto x_l = left_v(ss);
|
||||||
@ -131,22 +129,9 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
|
|
||||||
// This is in single precision and fails some tests
|
// This is in single precision and fails some tests
|
||||||
// Need a sumD that sums in double
|
// Need a sumD that sums in double
|
||||||
|
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||||
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
|
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
|
||||||
#else
|
#else
|
||||||
// Might make all code paths go this way.
|
|
||||||
auto left_v = left.View(CpuRead);
|
|
||||||
auto right_v=right.View(CpuRead);
|
|
||||||
|
|
||||||
// CPU
|
|
||||||
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
|
|
||||||
Vector<inner_t> inner_tmp(sites);
|
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
|
||||||
|
|
||||||
thread_for( ss, sites,{
|
|
||||||
auto x_l = left_v[ss];
|
|
||||||
auto y_l = right_v[ss];
|
|
||||||
inner_tmp_v[ss]=innerProductD(x_l,y_l);
|
|
||||||
})
|
|
||||||
nrm = TensorRemove(sum_cpu(inner_tmp_v,sites));
|
nrm = TensorRemove(sum_cpu(inner_tmp_v,sites));
|
||||||
#endif
|
#endif
|
||||||
grid->GlobalSum(nrm);
|
grid->GlobalSum(nrm);
|
||||||
@ -182,7 +167,6 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
const uint64_t nsimd = grid->Nsimd();
|
const uint64_t nsimd = grid->Nsimd();
|
||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
|
||||||
// GPU
|
// GPU
|
||||||
auto x_v=x.View(AcceleratorRead);
|
auto x_v=x.View(AcceleratorRead);
|
||||||
auto y_v=y.View(AcceleratorRead);
|
auto y_v=y.View(AcceleratorRead);
|
||||||
@ -197,23 +181,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
|
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
|
||||||
coalescedWrite(z_v[ss],tmp);
|
coalescedWrite(z_v[ss],tmp);
|
||||||
});
|
});
|
||||||
|
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||||
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
|
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
|
||||||
#else
|
#else
|
||||||
auto x_v=x.View(AcceleratorRead);
|
|
||||||
auto y_v=y.View(AcceleratorRead);
|
|
||||||
auto z_v=z.View(AcceleratorWrite);
|
|
||||||
|
|
||||||
// CPU
|
|
||||||
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
|
|
||||||
Vector<inner_t> inner_tmp(sites);
|
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
|
||||||
|
|
||||||
accelerator_for( ss, sites, nsimd,{
|
|
||||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
|
||||||
inner_tmp_v[ss]=innerProductD(tmp,tmp);
|
|
||||||
z_v[ss]=tmp;
|
|
||||||
});
|
|
||||||
// Already promoted to double
|
// Already promoted to double
|
||||||
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user