A library for working with phylogenetic and population genetic data.
v0.27.0
utils/math/histogram/distances.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
33 #include <cassert>
34 #include <cmath>
35 #include <stdexcept>
36 #include <vector>
37 
40 
41 namespace genesis {
42 namespace utils {
43 
44 // =================================================================================================
45 // Histogram Distances
46 // =================================================================================================
47 
48 double earth_movers_distance (const Histogram& h1, const Histogram& h2, bool norm)
49 {
50  // Local helper function for the actual workload.
51  auto earth_movers_distance_simple = [] ( Histogram const& h1, Histogram const& h2) {
52  // We are calcuating the EMD linarly:
53  //
54  // EMD[0] = 0
55  // EMD[i+1] = ( h1[i] + EMD[i] ) - h2[i]
56  // result = SUM (| EMD[i] |)
57  //
58  // It would probably suffice to keep only the last two values of the emd array,
59  // but for now this approach is easier to test.
60 
61  // There are approaches like bin-mapping that also allow emd on histograms with different
62  // numbers of bins (but same min and max value). There might even be something that can compare
63  // all kinds of histograms with each other. So far, we do not need those, so we expect our
64  // histograms to have equal ranges.
65  if (!equal_ranges(h1, h2)) {
66  throw std::range_error("__FUNCTION__: Histograms do not have equal ranges.");
67  }
68 
69  std::vector<double> emd (h1.bins(), 0.0);
70  double result = 0.0;
71 
72  for (size_t i = 0; i < h1.bins() - 1; ++i) {
73  emd[i + 1] = h1[i] + emd[i] - h2[i];
74  double const delta = h1.bin_midpoint(i + 1) - h1.bin_midpoint(i);
75 
76  // If delta is < 0, our Histograms have non-sorted ranges. Should not happen.
77  assert(delta >= 0);
78 
79  result += std::abs( emd[i + 1] ) * delta;
80  }
81 
82  return result;
83  };
84 
85  if (norm) {
86  auto hn1 = h1;
87  auto hn2 = h2;
88 
89  normalize(hn1);
90  normalize(hn2);
91 
92  return earth_movers_distance_simple( hn1, hn2 );
93  } else {
94  return earth_movers_distance_simple( h1, h2 );
95  }
96 }
97 
98 } // namespace utils
99 } // namespace genesis
genesis::utils::Histogram::bin_midpoint
double bin_midpoint(size_t bin_num) const
Definition: histogram.cpp:254
genesis::utils::equal_ranges
bool equal_ranges(Histogram const &lhs, Histogram const &rhs)
Definition: histogram.cpp:55
histogram.hpp
Header of Histogram class.
operations.hpp
Header of Histogram operations functions.
distances.hpp
Header of Histogram distance functions.
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::Histogram::bins
size_t bins() const
Definition: histogram.cpp:244
genesis::utils::earth_movers_distance
double earth_movers_distance(const Histogram &h1, const Histogram &h2, bool norm)
Definition: utils/math/histogram/distances.cpp:48
genesis::utils::normalize
void normalize(Histogram &h, double total)
Definition: operations.cpp:61
genesis::utils::Histogram
Histogram class for accumulating and summarizing data.
Definition: histogram.hpp:68