1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-14 01:35:36 +00:00

Boost precision in inner products in single precision

This commit is contained in:
Peter Boyle 2020-06-24 12:52:31 -04:00
parent 093d1ee21b
commit 22cfbdbbb3

View File

@ -93,7 +93,9 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
ssum = ssum+sumarray[i]; ssum = ssum+sumarray[i];
} }
return ssum; typedef typename vobj::scalar_object ssobj;
ssobj ret = ssum;
return ret;
} }
@ -154,7 +156,7 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
// Might make all code paths go this way. // Might make all code paths go this way.
typedef decltype(innerProduct(vobj(),vobj())) inner_t; typedef decltype(Reduce(innerProductD(vobj(),vobj()))) inner_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
@ -163,16 +165,15 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
autoView( right_v,right, AcceleratorRead); autoView( right_v,right, AcceleratorRead);
// GPU - SIMT lane compliance... // GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{ accelerator_for( ss, sites, 1,{
auto x_l = left_v(ss); auto x_l = left_v[ss];
auto y_l = right_v(ss); auto y_l = right_v[ss];
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); inner_tmp_v[ss]=Reduce(innerProductD(x_l,y_l));
}) });
} }
// This is in single precision and fails some tests // This is in single precision and fails some tests
// Need a sumD that sums in double nrm = TensorRemove(sum(inner_tmp_v,sites));
nrm = TensorRemove(sumD(inner_tmp_v,sites));
return nrm; return nrm;
} }
@ -218,16 +219,16 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
autoView( y_v, y, AcceleratorRead); autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite); autoView( z_v, z, AcceleratorWrite);
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; typedef decltype(Reduce(innerProductD(x_v[0],y_v[0]))) inner_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{ accelerator_for( ss, sites, 1,{
auto tmp = a*x_v(ss)+b*y_v(ss); auto tmp = a*x_v[ss]+b*y_v[ss];
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); inner_tmp_v[ss]=Reduce(innerProductD(tmp,tmp));
coalescedWrite(z_v[ss],tmp); z_v[ss]=tmp;
}); });
nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
return nrm; return nrm;
} }
@ -243,29 +244,28 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
// GPU // GPU
typedef decltype(innerProduct(vobj(),vobj())) inner_t; typedef decltype(Reduce(innerProductD(vobj(),vobj()))) inner_t;
typedef decltype(innerProduct(vobj(),vobj())) norm_t; typedef decltype(Reduce(innerProductD(vobj(),vobj()))) norm_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites); Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0]; auto norm_tmp_v = &norm_tmp[0];
{ {
autoView(left_v,left, AcceleratorRead); autoView(left_v,left, AcceleratorRead);
autoView(right_v,right,AcceleratorRead); autoView(right_v,right,AcceleratorRead);
accelerator_for( ss, sites, nsimd,{ accelerator_for( ss, sites, 1,{
auto left_tmp = left_v(ss); auto left_tmp = left_v[ss];
coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss))); inner_tmp_v[ss]=Reduce(innerProductD(left_tmp,right_v[ss]));
coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp)); norm_tmp_v [ss]=Reduce(innerProductD(left_tmp,left_tmp));
}); });
} }
tmp[0] = TensorRemove(sumD(inner_tmp_v,sites)); tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
tmp[1] = TensorRemove(sumD(norm_tmp_v,sites)); tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
ip = tmp[0]; ip = tmp[0];