A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
utils/math/histogram/distances.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
33 #include <assert.h>
34 #include <cmath>
35 #include <stdexcept>
36 #include <vector>
37 
40 
41 namespace genesis {
42 namespace utils {
43 
44 // =================================================================================================
45 // Histogram Distances
46 // =================================================================================================
47 
48 double earth_movers_distance (const Histogram& h1, const Histogram& h2, bool norm)
49 {
50  // Local helper function for the actual workload.
51  auto earth_movers_distance_simple = [] ( Histogram const& h1, Histogram const& h2) {
52  // We are calcuating the EMD linarly:
53  //
54  // EMD[0] = 0
55  // EMD[i+1] = ( h1[i] + EMD[i] ) - h2[i]
56  // result = SUM (| EMD[i] |)
57  //
58  // It would probably suffice to keep only the last two values of the emd array,
59  // but for now this approach is easier to test.
60 
61  // There are approaches like bin-mapping that also allow emd on histograms with different
62  // numbers of bins (but same min and max value). There might even be something that can compare
63  // all kinds of histograms with each other. So far, we do not need those, so we expect our
64  // histograms to have equal ranges.
65  if (!equal_ranges(h1, h2)) {
66  throw std::range_error("__FUNCTION__: Histograms do not have equal ranges.");
67  }
68 
69  std::vector<double> emd (h1.bins(), 0.0);
70  double result = 0.0;
71  double delta;
72 
73  for (size_t i = 0; i < h1.bins() - 1; ++i) {
74  emd[i + 1] = h1[i] + emd[i] - h2[i];
75  delta = h1.bin_midpoint(i + 1) - h1.bin_midpoint(i);
76 
77  // If delta is < 0, our Histograms have non-sorted ranges. Should not happen.
78  assert(delta >= 0);
79 
80  result += std::abs( emd[i + 1] ) * delta;
81  }
82 
83  return result;
84  };
85 
86  if (norm) {
87  auto hn1 = h1;
88  auto hn2 = h2;
89 
90  normalize(hn1);
91  normalize(hn2);
92 
93  return earth_movers_distance_simple( hn1, hn2 );
94  } else {
95  return earth_movers_distance_simple( h1, h2 );
96  }
97 }
98 
99 } // namespace utils
100 } // namespace genesis
size_t bins() const
Definition: histogram.cpp:244
Header of Histogram class.
Histogram class for accumulating and summarizing data.
Definition: histogram.hpp:68
bool equal_ranges(Histogram const &lhs, Histogram const &rhs)
Definition: histogram.cpp:55
Header of Histogram operations functions.
double bin_midpoint(size_t bin_num) const
Definition: histogram.cpp:254
double earth_movers_distance(const Histogram &h1, const Histogram &h2, bool norm)
void normalize(Histogram &h, double total)
Definition: operations.cpp:61
Header of Histogram distance functions.