mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-10 06:00:45 +01:00
Faster axpy_norm and innerProduct
This commit is contained in:
parent
bd99fd608c
commit
588c2f3cb1
@ -232,6 +232,7 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
|||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
// Might make all code paths go this way.
|
// Might make all code paths go this way.
|
||||||
|
#if 0
|
||||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
Vector<inner_t> inner_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
@ -241,15 +242,31 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
|||||||
autoView( right_v,right, AcceleratorRead);
|
autoView( right_v,right, AcceleratorRead);
|
||||||
|
|
||||||
// GPU - SIMT lane compliance...
|
// GPU - SIMT lane compliance...
|
||||||
accelerator_for( ss, sites, 1,{
|
accelerator_for( ss, sites, nsimd,{
|
||||||
auto x_l = left_v[ss];
|
auto x_l = left_v(ss);
|
||||||
auto y_l = right_v[ss];
|
auto y_l = right_v(ss);
|
||||||
inner_tmp_v[ss]=innerProductD(x_l,y_l);
|
coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
|
||||||
|
Vector<inner_t> inner_tmp(sites);
|
||||||
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
|
{
|
||||||
|
autoView( left_v , left, AcceleratorRead);
|
||||||
|
autoView( right_v,right, AcceleratorRead);
|
||||||
|
|
||||||
|
// GPU - SIMT lane compliance...
|
||||||
|
accelerator_for( ss, sites, nsimd,{
|
||||||
|
auto x_l = left_v(ss);
|
||||||
|
auto y_l = right_v(ss);
|
||||||
|
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
#endif
|
||||||
// This is in single precision and fails some tests
|
// This is in single precision and fails some tests
|
||||||
auto anrm = sum(inner_tmp_v,sites);
|
auto anrm = sumD(inner_tmp_v,sites);
|
||||||
nrm = anrm;
|
nrm = anrm;
|
||||||
return nrm;
|
return nrm;
|
||||||
}
|
}
|
||||||
@ -283,7 +300,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
conformable(x,y);
|
conformable(x,y);
|
||||||
|
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_typeD vector_type;
|
// typedef typename vobj::vector_typeD vector_type;
|
||||||
RealD nrm;
|
RealD nrm;
|
||||||
|
|
||||||
GridBase *grid = x.Grid();
|
GridBase *grid = x.Grid();
|
||||||
@ -295,17 +312,29 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
autoView( x_v, x, AcceleratorRead);
|
autoView( x_v, x, AcceleratorRead);
|
||||||
autoView( y_v, y, AcceleratorRead);
|
autoView( y_v, y, AcceleratorRead);
|
||||||
autoView( z_v, z, AcceleratorWrite);
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
|
#if 0
|
||||||
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
|
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
Vector<inner_t> inner_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
accelerator_for( ss, sites, 1,{
|
accelerator_for( ss, sites, nsimd,{
|
||||||
auto tmp = a*x_v[ss]+b*y_v[ss];
|
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||||
inner_tmp_v[ss]=innerProductD(tmp,tmp);
|
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
|
||||||
z_v[ss]=tmp;
|
coalescedWrite(z_v[ss],tmp);
|
||||||
});
|
});
|
||||||
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
||||||
|
#else
|
||||||
|
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
||||||
|
Vector<inner_t> inner_tmp(sites);
|
||||||
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
|
accelerator_for( ss, sites, nsimd,{
|
||||||
|
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||||
|
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
|
||||||
|
coalescedWrite(z_v[ss],tmp);
|
||||||
|
});
|
||||||
|
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
|
||||||
|
#endif
|
||||||
grid->GlobalSum(nrm);
|
grid->GlobalSum(nrm);
|
||||||
return nrm;
|
return nrm;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user