mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Boost precision in inner products in single
This commit is contained in:
parent
093d1ee21b
commit
22cfbdbbb3
@ -93,7 +93,9 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
|
||||
ssum = ssum+sumarray[i];
|
||||
}
|
||||
|
||||
return ssum;
|
||||
typedef typename vobj::scalar_object ssobj;
|
||||
ssobj ret = ssum;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@ -154,7 +156,7 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
||||
const uint64_t sites = grid->oSites();
|
||||
|
||||
// Might make all code paths go this way.
|
||||
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
|
||||
typedef decltype(Reduce(innerProductD(vobj(),vobj()))) inner_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
|
||||
@ -163,16 +165,15 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
||||
autoView( right_v,right, AcceleratorRead);
|
||||
|
||||
// GPU - SIMT lane compliance...
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
auto x_l = left_v(ss);
|
||||
auto y_l = right_v(ss);
|
||||
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
|
||||
})
|
||||
accelerator_for( ss, sites, 1,{
|
||||
auto x_l = left_v[ss];
|
||||
auto y_l = right_v[ss];
|
||||
inner_tmp_v[ss]=Reduce(innerProductD(x_l,y_l));
|
||||
});
|
||||
}
|
||||
|
||||
// This is in single precision and fails some tests
|
||||
// Need a sumD that sums in double
|
||||
nrm = TensorRemove(sumD(inner_tmp_v,sites));
|
||||
nrm = TensorRemove(sum(inner_tmp_v,sites));
|
||||
return nrm;
|
||||
}
|
||||
|
||||
@ -218,16 +219,16 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
||||
autoView( y_v, y, AcceleratorRead);
|
||||
autoView( z_v, z, AcceleratorWrite);
|
||||
|
||||
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
||||
typedef decltype(Reduce(innerProductD(x_v[0],y_v[0]))) inner_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
|
||||
coalescedWrite(z_v[ss],tmp);
|
||||
accelerator_for( ss, sites, 1,{
|
||||
auto tmp = a*x_v[ss]+b*y_v[ss];
|
||||
inner_tmp_v[ss]=Reduce(innerProductD(tmp,tmp));
|
||||
z_v[ss]=tmp;
|
||||
});
|
||||
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
|
||||
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
||||
grid->GlobalSum(nrm);
|
||||
return nrm;
|
||||
}
|
||||
@ -243,29 +244,28 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
|
||||
|
||||
GridBase *grid = left.Grid();
|
||||
|
||||
|
||||
const uint64_t nsimd = grid->Nsimd();
|
||||
const uint64_t sites = grid->oSites();
|
||||
|
||||
// GPU
|
||||
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
|
||||
typedef decltype(innerProduct(vobj(),vobj())) norm_t;
|
||||
typedef decltype(Reduce(innerProductD(vobj(),vobj()))) inner_t;
|
||||
typedef decltype(Reduce(innerProductD(vobj(),vobj()))) norm_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
Vector<norm_t> norm_tmp(sites);
|
||||
Vector<norm_t> norm_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
auto norm_tmp_v = &norm_tmp[0];
|
||||
{
|
||||
autoView(left_v,left, AcceleratorRead);
|
||||
autoView(right_v,right,AcceleratorRead);
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
auto left_tmp = left_v(ss);
|
||||
coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
|
||||
coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
|
||||
accelerator_for( ss, sites, 1,{
|
||||
auto left_tmp = left_v[ss];
|
||||
inner_tmp_v[ss]=Reduce(innerProductD(left_tmp,right_v[ss]));
|
||||
norm_tmp_v [ss]=Reduce(innerProductD(left_tmp,left_tmp));
|
||||
});
|
||||
}
|
||||
|
||||
tmp[0] = TensorRemove(sumD(inner_tmp_v,sites));
|
||||
tmp[1] = TensorRemove(sumD(norm_tmp_v,sites));
|
||||
tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
|
||||
tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
|
||||
|
||||
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
|
||||
ip = tmp[0];
|
||||
|
Loading…
Reference in New Issue
Block a user