A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tree/mass_tree/emd.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 #include "genesis/tree/tree.hpp"
37 
41 
42 #include <algorithm>
43 #include <cassert>
44 #include <cmath>
45 #include <stdexcept>
46 #include <utility>
47 #include <vector>
48 
49 #ifdef GENESIS_OPENMP
50 # include <omp.h>
51 #endif
52 
53 namespace genesis {
54 namespace tree {
55 
56 // =================================================================================================
57 // Earth Movers Distance
58 // =================================================================================================
59 
60 double earth_movers_distance( MassTree const& lhs, MassTree const& rhs, double const p )
61 {
62  // Check.
63  if( p <= 0.0 ) {
64  throw std::runtime_error(
65  "Invalid exponent value p for earth mover's distance calculation. Has to be > 0.0."
66  );
67  }
68 
69  // We make an extra copy, which is super expensive. In our our code, thus better do not
70  // use this function, but the one-tree version directly!
71  // auto copy = lhs;
72  // mass_tree_reverse_signs( copy );
73  // mass_tree_merge_trees_inplace( copy, rhs );
74  // return earth_movers_distance( copy, p ).first;
75 
76  // We don't do a full check for compatile topologies, but at least this check is cheap.
77  if( lhs.edge_count() != rhs.edge_count() ) {
78  throw std::invalid_argument( "MassTrees need to have same size." );
79  }
80 
81  // Keep track of the total resulting work (the distance we moved the masses).
82  // This is the result returned in the end.
83  double work = 0.0;
84 
85  // Store a list of masses for each processed node. It maps from node indices to the total
86  // mass that comes from the subtree below that node. Thus, for the root node, it should be
87  // the same value as sum_of_masses(). Both values should be close to zero (except for numerical
88  // issues), in order for the result of this function to be meaningful.
89  auto node_masses = std::vector<double>( lhs.node_count(), 0.0 );
90 
91  // Do a postorder traversal over both trees in parallel, starting at the root.
92  // In theory, it does not matter where we start the traversal - however, the positions of the
93  // masses are given as "proximal_length" on their branch, which always points away from the
94  // root. Thus, if we decided to traverse from a different node than the root, we would have to
95  // take this into account. So, we do start at the root, to keep it simple.
96  auto lhs_it = postorder( lhs ).begin();
97  auto rhs_it = postorder( rhs ).begin();
98  auto lhs_end = postorder( lhs ).end();
99  auto rhs_end = postorder( rhs ).end();
100  for( ; lhs_it != lhs_end && rhs_it != rhs_end ; ++lhs_it, ++rhs_it ) {
101 
102  // If we are at the last iteration, we reached the root. Thus, we have moved all masses
103  // and don't need to proceed. If we did, we would count an edge of the root again
104  // (because the iterator traverses nodes, not edges, so the root node itself is traversed,
105  // although it has no proper edge that we would need to process).
106  if( lhs_it.is_last_iteration() || rhs_it.is_last_iteration() ) {
107  // If one iterator is at the end, but not the other, something is wrong.
108  if( ! lhs_it.is_last_iteration() || ! rhs_it.is_last_iteration() ) {
109  throw std::invalid_argument( "Incompatible MassTrees." );
110  }
111  continue;
112  }
113 
114  // Some shorthands.
115  const size_t pri_node_index = lhs_it.edge().primary_node().index();
116  const size_t sec_node_index = lhs_it.edge().secondary_node().index();
117 
118  // More checks.
119  if(
120  pri_node_index != rhs_it.edge().primary_node().index() ||
121  sec_node_index != rhs_it.edge().secondary_node().index()
122  ) {
123  throw std::invalid_argument( "Incompatible MassTrees." );
124  }
125 
126  // The iterator should guarantee that its edge is always the one pointing towards the root.
127  // Still, better check this!
128  assert( sec_node_index == lhs_it.node().index() );
129  assert( sec_node_index == rhs_it.node().index() );
130 
131  // Add both masses to a common map, one of them with negative sign.
132  // This is faster than merging into a vector, and easier than doing a parallel iteration
133  // over the values in sorted order.
134  auto edge_masses = lhs_it.edge().data<MassTreeEdgeData>().masses;
135  for( auto const& mass : rhs_it.edge().data<MassTreeEdgeData>().masses ) {
136  edge_masses[ mass.first ] -= mass.second;
137  }
138 
139  // We now start a "normal" earth movers distance caluclation along the current edge.
140  // We start at the end of the branch, with the mass that comes from the subtree below it...
141  double current_pos = std::max(
142  lhs_it.edge().data<MassTreeEdgeData>().branch_length,
143  rhs_it.edge().data<MassTreeEdgeData>().branch_length
144  );
145  double current_mass = node_masses[ sec_node_index ];
146 
147  // ... and move the mass along the branch, balancing it with the masses found on the branch.
148  // We use a reverse iterator in order to traverse the branch from end to start.
149  for(
150  auto mass_rit = edge_masses.crbegin();
151  mass_rit != edge_masses.crend();
152  ++mass_rit
153  ) {
154  // The work is accumulated: The mass that we are currently moving times the distances
155  // that we move it.
156  work += std::pow( std::abs( current_mass ), p ) * ( current_pos - mass_rit->first );
157 
158  // Update the current position and mass.
159  current_pos = mass_rit->first;
160  current_mass += mass_rit->second;
161  }
162 
163  // After we finished moving along the branch, we need extra work to move the remaining mass
164  // to the node at the top end of the branch. Also, add the remaining mass to this node, so
165  // that it is available for when we process the upper part of that node (towards the root).
166  work += std::pow( std::abs( current_mass ), p ) * current_pos;
167  node_masses[ pri_node_index ] += current_mass;
168  }
169 
170  // Now we need to be done with both trees, otherwise we have a problem.
171  if( lhs_it != lhs_end || rhs_it != rhs_end ) {
172  throw std::invalid_argument( "Incompatible MassTrees." );
173  }
174 
175  // Apply the outer exponent.
176  if( p > 1.0 ) {
177  work = std::pow( work, 1.0 / p );
178  }
179 
180  return work;
181 }
182 
183 utils::Matrix<double> earth_movers_distance( std::vector<MassTree> const& trees, double const p )
184 {
185  // Check.
186  if( p <= 0.0 ) {
187  throw std::runtime_error(
188  "Invalid exponent value p for earth mover's distance calculation. Has to be > 0.0."
189  );
190  }
191 
192  // Init result matrix.
193  auto result = utils::Matrix<double>( trees.size(), trees.size(), 0.0 );
194 
195  // Parallel specialized code.
196  #ifdef GENESIS_OPENMP
197 
198  // We only need to calculate the upper triangle. Get the number of indices needed
199  // to describe this triangle.
200  size_t const max_k = utils::triangular_size( trees.size() );
201 
202  // Use dynamic parallelization, as trees might be of different size (in terms of number
203  // of mass points).
204  #pragma omp parallel for schedule( dynamic )
205  for( size_t k = 0; k < max_k; ++k ) {
206 
207  // For the given linear index, get the actual position in the Matrix.
208  auto const ij = utils::triangular_indices( k, trees.size() );
209  auto const i = ij.first;
210  auto const j = ij.second;
211 
212  // Calculate EMD and fill symmetric Matrix.
213  auto const emd = earth_movers_distance( trees[i], trees[j], p );
214  result( i, j ) = emd;
215  result( j, i ) = emd;
216  }
217 
218  // If no threads are available at all, use serial version.
219  #else
220 
221  for( size_t i = 0; i < trees.size(); ++i ) {
222 
223  // The result is symmetric - we only calculate the upper triangle.
224  for( size_t j = i + 1; j < trees.size(); ++j ) {
225 
226  auto const emd = earth_movers_distance( trees[i], trees[j], p );
227  result( i, j ) = emd;
228  result( j, i ) = emd;
229  }
230  }
231 
232  #endif
233 
234  return result;
235 }
236 
237 std::pair<double, double> earth_movers_distance( MassTree const& tree, double const p )
238 {
239  // Check.
240  if( p <= 0.0 ) {
241  throw std::runtime_error(
242  "Invalid exponent value p for earth mover's distance calculation. Has to be > 0.0."
243  );
244  }
245 
246  // Keep track of the total resulting work (the distance we moved the masses).
247  // This is the result returned in the end.
248  double work = 0.0;
249 
250  // Store a list of masses for each processed node. It maps from node indices to the total
251  // mass that comes from the subtree below that node. Thus, for the root node, it should be
252  // the same value as sum_of_masses(). Both values should be close to zero (except for numerical
253  // issues), in order for the result of this function to be meaningful.
254  auto node_masses = std::vector<double>( tree.node_count(), 0.0 );
255 
256  // Do a postorder traversal of the tree, starting at the root.
257  // In theory, it does not matter where we start the traversal - however, the positions of the
258  // masses are given as "proximal_length" on their branch, which always points away from the
259  // root. Thus, if we decided to traverse from a different node than the root, we would have to
260  // take this into account. So, we do start at the root, to keep it simple.
261  for( auto tree_it : postorder(tree) ) {
262 
263  // If we are at the last iteration, we reached the root. Thus, we have moved all masses
264  // and don't need to proceed. If we did, we would count an edge of the root again
265  // (because the iterator traverses nodes, not edges, so the root node itself is traversed,
266  // although it has no proper edge that we would need to process).
267  if( tree_it.is_last_iteration() ) {
268  continue;
269  }
270 
271  // Some shorthands.
272  const size_t pri_node_index = tree_it.edge().primary_node().index();
273  const size_t sec_node_index = tree_it.edge().secondary_node().index();
274 
275  // The iterator should guarantee that its edge is always the one pointing towards the root.
276  // Still, better check this!
277  assert( sec_node_index == tree_it.node().index() );
278 
279  // We now start a "normal" earth movers distance caluclation along the current edge.
280  // We start at the end of the branch, with the mass that comes from the subtree below it...
281  double current_pos = tree_it.edge().data<MassTreeEdgeData>().branch_length;
282  double current_mass = node_masses[ sec_node_index ];
283 
284  // ... and move the mass along the branch, balancing it with the masses found on the branch.
285  // We use a reverse iterator in order to traverse the branch from end to start.
286  auto const& edge_masses = tree_it.edge().data<MassTreeEdgeData>().masses;
287  for(
288  auto mass_rit = edge_masses.crbegin();
289  mass_rit != edge_masses.crend();
290  ++mass_rit
291  ) {
292  // The work is accumulated: The mass that we are currently moving times the distances
293  // that we move it, taking the exponent into account.
294  work += std::pow( std::abs( current_mass ), p ) * ( current_pos - mass_rit->first );
295 
296  // Update the current position and mass.
297  current_pos = mass_rit->first;
298  current_mass += mass_rit->second;
299  }
300 
301  // After we finished moving along the branch, we need extra work to move the remaining mass
302  // to the node at the top end of the branch. Also, add the remaining mass to this node, so
303  // that it is available for when we process the upper part of that node (towards the root).
304  // Here again we need to take the exponent into account.
305  work += std::pow( std::abs( current_mass ), p ) * current_pos;
306  node_masses[ pri_node_index ] += current_mass;
307  }
308 
309  // Apply the outer exponent.
310  if( p > 1.0 ) {
311  work = std::pow( work, 1.0 / p );
312  }
313 
314  // Finally, return the needed work, and the mass at the root, as a way of correctness checking.
315  return { work, node_masses[ tree.root_node().index() ] };
316 }
317 
318 } // namespace tree
319 } // namespace genesis
Tree operator functions.
Data class for MassTreeEdges. Stores the branch length and a list of masses with their positions alon...
size_t index() const
Return the index of this Node.
Definition: node.cpp:48
TreeNode & root_node()
Return the TreeNode at the current root of the Tree.
Definition: tree/tree.cpp:258
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.cpp:350
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
size_t triangular_size(size_t n)
Calculate the number of linear indices needed for a triangular Matrix of size n.
Provides easy and fast logging functionality.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
std::pair< size_t, size_t > triangular_indices(size_t k, size_t n)
Given a linear index in a upper triangular Matrix, find the corresponding Matrix indices.
Matrix operators.
Header of Tree class.
utils::Range< IteratorPostorder< TreeLink const, TreeNode const, TreeEdge const > > postorder(ElementType const &element)
double earth_movers_distance(MassTree const &lhs, MassTree const &rhs, double const p)
Calculate the earth mover's distance of two distributions of masses on a given Tree.