A library for working with phylogenetic and population genetic data.
v0.27.0
utils/math/histogram/stats.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
33 #include <algorithm>
34 #include <cassert>
35 #include <cmath>
36 #include <iterator>
37 #include <numeric>
38 
40 
41 namespace genesis {
42 namespace utils {
43 
44 // =================================================================================================
45 // Histogram Statistics
46 // =================================================================================================
47 
48 double min_value(const Histogram& h)
49 {
50  return *std::min_element(h.begin(), h.end());
51 }
52 
53 double max_value(const Histogram& h)
54 {
55  return *std::max_element(h.begin(), h.end());
56 }
57 
58 size_t min_bin(const Histogram& h)
59 {
60  return std::distance(h.begin(), std::min_element(h.begin(), h.end()));
61 }
62 
63 size_t max_bin(const Histogram& h)
64 {
65  return std::distance(h.begin(), std::max_element(h.begin(), h.end()));
66 }
67 
68 double median(const Histogram& h)
69 {
70  const double mid = sum(h) / 2;
71  double part = 0.0;
72 
73  size_t i;
74  for (i = 0; i < h.bins(); ++i) {
75  if (part + h[i] < mid) {
76  part += h[i];
77  } else {
78  break;
79  }
80  }
81  assert(part < mid && i < h.bins());
82 
83  // Find the relative position of mid within the interval [part, part + bin[i]).
84  // This determines where exactly our median is.
85  const double pos = (mid - part) / h[i];
86 
87  // Now find this position within the range of the bin and return it.
88  return h.bin_range(i).first + pos * h.bin_width(i);
89 }
90 
91 double mean(const Histogram& h)
92 {
93  // We are using the recurrence relation
94  // M(n) = M(n-1) + (x[n] - M(n-1)) (w(n)/(W(n-1) + w(n)))
95  // W(n) = W(n-1) + w(n)
96  // which is also used in the GNU Scientific Library.
97 
98  double wmean = 0.0;
99  double weight = 0.0;
100 
101  for (size_t i = 0; i < h.bins(); ++i) {
102  const double xi = h.bin_midpoint(i);
103  const double wi = h[i];
104 
105  if (wi > 0.0) {
106  weight += wi;
107  wmean += (xi - wmean) * (wi / weight);
108  }
109  }
110 
111  return wmean;
112 }
113 
114 double sigma(const Histogram& h)
115 {
116  // We use the same method as in the GNU Scientific Library.
117  // It is a two-pass algorithm for stability.
118  // Could also use a single pass formula, as given in
119  // N. J. Higham: "Accuracy and Stability of Numerical Methods", p.12
120 
121  const double wmean = mean(h);
122  double weight = 0.0;
123  double wvar = 0.0;
124 
125  for (size_t i = 0; i < h.bins(); ++i) {
126  const double xi = h.bin_midpoint(i);
127  const double wi = h[i];
128 
129  if (wi > 0.0) {
130  const double delta = (xi - wmean);
131 
132  weight += wi;
133  wvar += (delta * delta - wvar) * (wi / weight);
134  }
135  }
136 
137  return std::sqrt(wvar);
138 }
139 
140 double sum(const Histogram& h)
141 {
142  return std::accumulate (h.begin(), h.end(), 0.0);
143 }
144 
145 } // namespace utils
146 } // namespace genesis
genesis::utils::Histogram::bin_midpoint
double bin_midpoint(size_t bin_num) const
Definition: histogram.cpp:254
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
genesis::utils::sigma
double sigma(const Histogram &h)
Compute the bin-weighted standard deviation.
Definition: utils/math/histogram/stats.cpp:114
histogram.hpp
Header of Histogram class.
genesis::utils::min_bin
size_t min_bin(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:58
genesis::utils::Histogram::end
iterator end()
Definition: histogram.cpp:215
genesis::utils::Histogram::bin_width
double bin_width(size_t bin_num) const
Definition: histogram.cpp:259
genesis::utils::max_value
double max_value(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:53
genesis::utils::max_bin
size_t max_bin(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:63
genesis::utils::Histogram::bin_range
std::pair< double, double > bin_range(size_t bin_num) const
Definition: histogram.cpp:249
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
stats.hpp
Header of Histogram statistics functions.
genesis::utils::Histogram::bins
size_t bins() const
Definition: histogram.cpp:244
genesis::utils::Histogram::begin
iterator begin()
Definition: histogram.cpp:210
genesis::utils::mean
double mean(const Histogram &h)
Compute the bin-weighted arithmetic mean.
Definition: utils/math/histogram/stats.cpp:91
genesis::utils::min_value
double min_value(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:48
genesis::utils::Histogram
Histogram class for accumulating and summarizing data.
Definition: histogram.hpp:68
genesis::utils::median
double median(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:68