1
0
mirror of https://github.com/aportelli/LatAnalyze.git synced 2024-11-10 00:45:36 +00:00

Histogram: percentiles & confidence interval

This commit is contained in:
Antonin Portelli 2015-11-24 16:27:47 +00:00
parent 3f28430e4a
commit 9be76b11d7
3 changed files with 132 additions and 26 deletions

View File

@ -51,6 +51,14 @@ int main(void)
}
h.setFromData(gauss, -5., 5., 40);
h.normalize();
cout << " median= " << h.median() << endl;
for (double s = 1.; s < 5.; ++s)
{
auto ci = h.confidenceInterval(s);
cout << static_cast<int>(s) << " sigma(s) interval= [";
cout << ci.first << ", " << ci.second << "]" << endl;
}
p << PlotHistogram(h);
p << PlotFunction(compile("return exp(-x_0^2/2)/sqrt(2*pi);", 1), -5., 5.);
p.display();

View File

@ -19,6 +19,9 @@
#include <LatAnalyze/Histogram.hpp>
#include <LatAnalyze/includes.hpp>
#include <gsl/gsl_histogram.h>
#include <gsl/gsl_sf.h>
#include <gsl/gsl_sort.h>
using namespace std;
using namespace Latan;
@ -64,38 +67,47 @@ void Histogram::setFromData(const DVec &data, const DVec &w, const double xMin,
LATAN_ERROR(Size, "data vector and weight vector size mismatch");
}
resize(nBin);
DECL_GSL_HIST(h);
data_ = data.array();
w_ = w.array();
xMax_ = xMax;
xMin_ = xMin;
gsl_histogram_set_ranges_uniform(&h, xMin, xMax);
FOR_VEC(data, i)
{
gsl_histogram_accumulate(&h, data(i), w(i));
}
total_ = w.sum();
computeNorm();
makeHistogram();
}
void Histogram::setFromData(const DVec &data, const double xMin,
const double xMax, const Index nBin)
{
resize(nBin);
DECL_GSL_HIST(h);
data_ = data.array();
xMax_ = xMax;
xMin_ = xMin;
gsl_histogram_set_ranges_uniform(&h, xMin, xMax);
FOR_VEC(data, i)
w_.resize(data.size());
w_.fill(1.);
makeHistogram();
}
// histogram calculation ///////////////////////////////////////////////////////
void Histogram::makeHistogram(void)
{
DECL_GSL_HIST(h);
gsl_histogram_set_ranges_uniform(&h, xMin_, xMax_);
FOR_STAT_ARRAY(data_, i)
{
gsl_histogram_increment(&h, data(i));
gsl_histogram_accumulate(&h, data_[i], w_[i]);
}
total_ = static_cast<double>(data.size());
total_ = w_.sum();
sortIndices();
computeNorm();
}
// generate sorted indices /////////////////////////////////////////////////////
void Histogram::sortIndices(void)
{
sInd_.resize(data_.size());
gsl_sort_index(sInd_.data(), data_.data(), 1, data_.size());
}
// compute normalization factor ////////////////////////////////////////////////
void Histogram::computeNorm(void)
{
@ -119,6 +131,16 @@ Index Histogram::size(void) const
return bin_.size();
}
const StatArray<double> & Histogram::getData(void) const
{
return data_;
}
const StatArray<double> & Histogram::getWeight(void) const
{
return w_;
}
double Histogram::getX(const Index i) const
{
return x_(i);
@ -139,3 +161,68 @@ double Histogram::operator()(const double x) const
return (*this)[static_cast<Index>(i)];
}
// percentiles & confidence interval ///////////////////////////////////////////
double Histogram::percentile(const double p) const
{
if ((p < 0.0)||(p > 100.0))
{
LATAN_ERROR(Range, "percentile (" + strFrom(p) + ")"
" is outside the [0, 100] range");
}
// cf. http://en.wikipedia.org/wiki/Percentile
double wPSum, p_i, p_im1, w_i, res = 0.;
bool haveResult;
wPSum = w_[sInd_[0]];
p_i = (100./total_)*wPSum*0.5;
if (p < p_i)
{
res = data_[sInd_[0]];
}
else
{
haveResult = false;
p_im1 = p_i;
for (Index i = 1; i < data_.size(); ++i)
{
w_i = w_[sInd_[i]];
wPSum += w_i;
p_i = (100./total_)*(wPSum-0.5*w_i);
if ((p >= p_im1) and (p < p_i))
{
double d_i = data_[sInd_[i]], d_im1 = data_[sInd_[i-1]];
res = d_im1 + (p-p_im1)/(p_i-p_im1)*(d_i-d_im1);
haveResult = true;
break;
}
}
if (!haveResult)
{
res = data_[sInd_[data_.size()-1]];
}
}
return res;
}
double Histogram::median(void) const
{
return percentile(50.);
}
pair<double, double> Histogram::confidenceInterval(const double nSigma) const
{
pair<double, double> interval, p;
double cl;
cl = gsl_sf_erf(nSigma/sqrt(2.));
p.first = 50.*(1. - cl);
p.second = 50.*(1. + cl);
interval.first = percentile(p.first);
interval.second = percentile(p.second);
return interval;
}

View File

@ -21,8 +21,7 @@
#define Latan_Histogram_hpp_
#include <LatAnalyze/Global.hpp>
#include <gsl/gsl_histogram.h>
#include <gsl/gsl_cdf.h>
#include <LatAnalyze/StatArray.hpp>
BEGIN_LATAN_NAMESPACE
@ -49,19 +48,31 @@ public:
void normalize(const bool n = true);
bool isNormalized(void) const;
// access
Index size(void) const;
double getX(const Index i) const;
double operator[](const Index i) const;
double operator()(const double x) const;
Index size(void) const;
const StatArray<double> & getData(void) const;
const StatArray<double> & getWeight(void) const;
double getX(const Index i) const;
double operator[](const Index i) const;
double operator()(const double x) const;
// percentiles & confidence interval
double percentile(const double p) const;
double median(void) const;
std::pair<double, double> confidenceInterval(const double nSigma) const;
private:
// resize
void resize(const Index nBin);
// histogram calculation
void makeHistogram(void);
// generate sorted indices
void sortIndices(void);
// compute normalization factor
void computeNorm(void);
private:
DVec x_, bin_;
double total_, norm_, xMax_, xMin_;
bool normalize_{false};
StatArray<double> data_, w_;
DVec x_, bin_;
Vec<size_t> sInd_;
double total_, norm_, xMax_, xMin_;
bool normalize_{false};
};
END_LATAN_NAMESPACE