A library for working with phylogenetic and population genetic data.
v0.32.0
tree/common_tree/distances.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 #include "genesis/tree/tree.hpp"
38 
40 
41 #include <algorithm>
42 #include <cassert>
43 #include <stdexcept>
44 #include <vector>
45 
46 #ifdef GENESIS_OPENMP
47 # include <omp.h>
48 #endif
49 
50 namespace genesis {
51 namespace tree {
52 
53 // =================================================================================================
54 // Branch Distance Measures
55 // =================================================================================================
56 
58  Tree const& tree
59 ) {
60  utils::Matrix<double> result( tree.node_count(), tree.node_count(), 0.0 );
61 
62  // We need to keep track of the edges for which we run updates in the iterations below.
63  // Init with root node, as it does not have a proximal edge, and hence is not going to be visited.
64  std::vector<size_t> visited_indices;
65  visited_indices.reserve( tree.node_count() );
66  visited_indices.push_back( tree.root_node().index() );
67 
68  // Go through the tree, and use the preorder traversal to get inner distances towards the root
69  // first, and later use them to calculate the outer ones, getting further and further away
70  // from the root.
71  for( auto it : preorder( tree )) {
72 
73  // We want to visit each edge once: it.edge() gives the edge going towards the root.
74  // Hence, we skip the first iteration, which gives one of the edges of the root that will
75  // be visited later again anyway, and we do not want to visit it twice.
76  if( it.is_first_iteration() ) {
77  continue;
78  }
79 
80  // Get the node away from the root, its parent towards the root,
81  // and the length of the current edge (the one between those two nodes).
82  auto const node_id = it.node().index();
83  auto const upper_id = it.node().primary_link().outer().node().index();
84  auto const br_len = it.edge().data<CommonEdgeData>().branch_length;
85  assert( node_id == it.edge().secondary_node().index() );
86  assert( upper_id == it.edge().primary_node().index() );
87 
88  // Now set the length from the given node to all the ones that we have visited before
89  // in the preorder. We already have their distances, and can use them to calculate the
90  // distances for the given node. This is kind of like dynamic programming combined with
91  // an upper triangle matrix calculation or inductive computation. Fancy fancy.
92  for( auto const& cur_id : visited_indices ) {
93 
94  // We are visiting each node once. The current one, being part of the already visited
95  // nodes, can hence not be the one of the outer preorder loop.
96  assert( cur_id != node_id );
97 
98  // Get the distance between the node currently being updated and the parent node
99  // of the node of the outer preorder loop.
100  auto const upper_br_len = result( upper_id, cur_id );
101  assert( upper_br_len == result( cur_id, upper_id ));
102 
103  // Set the branch length between the current node and the outer preorder node
104  // as the sum of the parent and the current branch length.
105  // That is, we use the distance of the parent node plus the distance from that parent
106  // to the outer preorder node.
107  assert( result( node_id, cur_id ) == 0.0 );
108  assert( result( cur_id, node_id ) == 0.0 );
109  result( node_id, cur_id ) = upper_br_len + br_len;
110  result( cur_id, node_id ) = upper_br_len + br_len;
111  }
112 
113  // Now add the preorder node to the already visited ones, so that it is updated in
114  // the subsequent iterations.
115  visited_indices.push_back( node_id );
116  }
117 
118  return result;
119 }
120 
122  Tree const& tree,
123  TreeNode const* node
124 ) {
125  if (!node) {
126  node = &tree.root_node();
127  }
128 
129  // Store the distance from each node to the given node.
130  auto vec = std::vector<double>(tree.node_count(), -1.0);
131  vec[node->index()] = 0.0;
132 
133  // Calculate the distance vector via levelorder iteration.
134  for( auto it : levelorder( *node ) ) {
135  // Skip the starting node (it is already set to 0).
136  if (it.is_first_iteration()) {
137  continue;
138  }
139 
140  // We do not have the distance of the current node, but the one of its "parent" (the one in
141  // direction of the starting node)!
142  assert(vec[it.node().index()] == -1.0);
143  assert(vec[it.link().outer().node().index()] > -1.0);
144 
145  // The distance is the distance from the "parent" node (the next one in direction towards
146  // the starting node) plus the branch length.
147  vec[it.node().index()]
148  = vec[it.link().outer().node().index()]
149  + it.edge().data<CommonEdgeData>().branch_length;
150  }
151 
152  return vec;
153 }
154 
156  Tree const& tree
157 ) {
158  // Result matrix that will be returned.
159  utils::Matrix<double> mat (tree.edge_count(), tree.edge_count());
160 
161  // For calculating the distance between edges, we use the distances between nodes and for every
162  // pair of edged find the nodes at the ends of the edges that are closest to each other. This
163  // is then the shortest distance between the two edges.
164  // There is probably a way to get this distance via some tree traversal, which would save us
165  // some lookups and calculation of the min, but be more complex and error prone.
166  // For now, this version should be fast enough.
167  auto node_dist_mat = node_branch_length_distance_matrix(tree);
168 
169  #pragma omp parallel for
170  for( size_t i = 0; i < tree.edge_count(); ++i ) {
171  auto const& row_edge = tree.edge_at(i);
172 
173  for( auto const& col_edge : tree.edges() ) {
174 
175  // Set the diagonal element of the matrix. We don't need to compare nodes in this case,
176  // and particularly want to skip the part where we add half the branch lengths to the
177  // distance. After all, the distance between and element and itself should always be 0!
178  if (row_edge.index() == col_edge.index()) {
179  mat(row_edge.index(), row_edge.index()) = 0.0;
180  continue;
181  }
182 
183  // primary-primary case
184  double pp = node_dist_mat(
185  row_edge.primary_node().index(),
186  col_edge.primary_node().index()
187  );
188 
189  // primary-secondary case
190  double ps = node_dist_mat(
191  row_edge.primary_node().index(),
192  col_edge.secondary_node().index()
193  );
194 
195  // secondary-primary case
196  double sp = node_dist_mat(
197  row_edge.secondary_node().index(),
198  col_edge.primary_node().index()
199  );
200 
201  // Find min. Make sure that the fourth case "secondary-secondary" is not shorter
202  // (if this ever happens, the tree is broken).
203  double dist = std::min(pp, std::min(ps, sp));
204  assert(dist <= node_dist_mat(
205  row_edge.secondary_node().index(),
206  col_edge.secondary_node().index()
207  ));
208 
209  // Store in matrix, with halves of the branch lengths.
210  mat(row_edge.index(), col_edge.index())
211  = (dist)
212  + ( row_edge.data<CommonEdgeData>().branch_length / 2.0 )
213  + ( col_edge.data<CommonEdgeData>().branch_length / 2.0 )
214  ;
215  }
216  }
217 
218  return mat;
219 }
220 
222  Tree const& tree,
223  TreeEdge const* edge
224 ) {
225  std::vector<double> vec (tree.edge_count());
226 
227  if( edge == nullptr ) {
228  throw std::runtime_error( "Cannot use nullptr edge for distance calulcation." );
229  }
230 
231  // Works similar to edge_branch_length_distance_matrix(). See there for a description of the implementation.
232 
233  // We just need two rows of the distance matrix - let's take the vectors instead for speed.
234  auto p_node_dist = node_branch_length_distance_vector(tree, &edge->primary_node());
235  auto s_node_dist = node_branch_length_distance_vector(tree, &edge->secondary_node());
236 
237  #pragma omp parallel for
238  for( size_t i = 0; i < tree.edge_count(); ++i ) {
239  auto const& col_edge = tree.edge_at(i);
240 
241  if (edge->index() == col_edge.index()) {
242  vec[ edge->index() ] = 0.0;
243  continue;
244  }
245 
246  // primary-primary case
247  double pp = p_node_dist[ col_edge.primary_node().index() ];
248 
249  // primary-secondary case
250  double ps = p_node_dist[ col_edge.secondary_node().index() ];
251 
252  // secondary-primary case
253  double sp = s_node_dist[ col_edge.primary_node().index() ];
254 
255  // Find min. Make sure that the fourth case "secondary-secondary" is not shorter
256  // (if this ever happens, the tree is broken).
257  double dist = std::min(pp, std::min(ps, sp));
258  assert(dist <= s_node_dist[ col_edge.secondary_node().index() ]);
259 
260  // Store in vector, with halves of the branch lengths.
261  vec[ col_edge.index() ]
262  = ( dist )
263  + ( edge->data<CommonEdgeData>().branch_length / 2.0 )
264  + ( col_edge.data<CommonEdgeData>().branch_length / 2.0 )
265  ;
266  }
267 
268  return vec;
269 }
270 
271 // =================================================================================================
272 // Complex Distance Methods
273 // =================================================================================================
274 
275 double deepest_distance(Tree const& tree)
276 {
277  double max = 0.0;
278  auto leaf_dist = closest_leaf_distance_vector(tree);
279 
280  for( auto const& e : tree.edges() ) {
281  auto idx_p = e.primary_node().index();
282  auto idx_s = e.secondary_node().index();
283 
284  double d = (leaf_dist[idx_p].second
285  + e.data<CommonEdgeData>().branch_length
286  + leaf_dist[ idx_s ].second) / 2.0;
287 
288  if (d > max) {
289  max = d;
290  }
291  }
292  return max;
293 }
294 
299 template< class Comparator >
300 std::vector<std::pair< TreeNode const*, double >> leaf_distance_vector(
301  Tree const& tree,
302  utils::Matrix<double> const& node_distances,
303  Comparator comp
304 ) {
305  // prepare a result vector with the size of number of nodes.
306  std::vector<std::pair< TreeNode const*, double>> vec;
307  vec.resize(tree.node_count(), {nullptr, 0.0});
308 
309  if( node_distances.rows() != tree.node_count() || node_distances.cols() != tree.node_count() ) {
310  throw std::runtime_error( "Invalid node_branch_length_distance_matrix." );
311  }
312 
313  // fill the vector for every node.
314  // there is probably a faster way of doing this: preorder traversal with pruning. but for now,
315  // this simple O(n^2) version works.
316  for( auto const& node : tree.nodes() ) {
317 
318  // we have not visited this node. assertion holds as long as the indices are correct.
319  assert( vec[ node.index() ].first == nullptr );
320 
321  TreeNode const* res_node = nullptr;
322  double res_dist = 0.0;
323 
324  // try out all other nodes, and find the closest leaf.
325  for( auto const& other : tree.nodes() ) {
326 
327  if(! is_leaf( other )) {
328  continue;
329  }
330 
331  double dist = node_distances( node.index(), other.index() );
332  if( res_node == nullptr || comp( dist, res_dist )) {
333  res_node = &other;
334  res_dist = dist;
335  }
336  }
337 
338  vec[ node.index() ].first = res_node;
339  vec[ node.index() ].second = res_dist;
340  }
341 
342  return vec;
343 }
344 
345 std::vector<std::pair< TreeNode const*, double >> closest_leaf_distance_vector(
346  Tree const& tree
347 ) {
348  // we need the pairwise distances between all nodes, so we can do quick lookups.
349  auto node_distances = node_branch_length_distance_matrix(tree);
350 
351  return leaf_distance_vector( tree, node_distances, std::less<double>() );
352 }
353 
354 std::vector<std::pair< TreeNode const*, double>> closest_leaf_distance_vector(
355  Tree const& tree,
356  utils::Matrix<double> const& node_distances
357 ) {
358  return leaf_distance_vector( tree, node_distances, std::less<double>() );
359 }
360 
361 std::vector<std::pair< TreeNode const*, double>> furthest_leaf_distance_vector(
362  Tree const& tree
363 ) {
364  // we need the pairwise distances between all nodes, so we can do quick lookups.
365  auto node_distances = node_branch_length_distance_matrix(tree);
366 
367  return leaf_distance_vector( tree, node_distances, std::greater<double>() );
368 }
369 
370 std::vector<std::pair< TreeNode const*, double>> furthest_leaf_distance_vector(
371  Tree const& tree,
372  utils::Matrix<double> const& node_distances
373 ) {
374  return leaf_distance_vector( tree, node_distances, std::greater<double>() );
375 }
376 
377 } // namespace tree
378 } // namespace genesis
genesis::tree::Tree::node_count
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.hpp:264
genesis::utils::Matrix::cols
size_t cols() const
Definition: containers/matrix.hpp:181
genesis::tree::furthest_leaf_distance_vector
std::vector< std::pair< TreeNode const *, double > > furthest_leaf_distance_vector(Tree const &tree)
Opposite of closest_leaf_distance_vector().
Definition: tree/common_tree/distances.cpp:361
genesis::tree::node_branch_length_distance_vector
std::vector< double > node_branch_length_distance_vector(Tree const &tree, TreeNode const *node)
Return a vector containing the distance of all nodes with respect to the given start node,...
Definition: tree/common_tree/distances.cpp:121
genesis::tree::leaf_distance_vector
std::vector< std::pair< TreeNode const *, double > > leaf_distance_vector(Tree const &tree, utils::Matrix< double > const &node_distances, Comparator comp)
Local helper function to calculate either closest_leaf_distance_vector() or furthest_leaf_distance_ve...
Definition: tree/common_tree/distances.cpp:300
tree.hpp
Header of Tree class.
genesis::tree::TreeEdge::index
size_t index() const
Return the index of this Edge.
Definition: edge.hpp:106
functions.hpp
genesis::tree::Tree::edge_at
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.hpp:238
genesis::tree::preorder
utils::Range< IteratorPreorder< true > > preorder(ElementType const &element)
Definition: tree/iterator/preorder.hpp:264
genesis::tree::Tree::root_node
TreeNode & root_node()
Return the TreeNode at the current root of the Tree.
Definition: tree/tree.hpp:300
genesis::tree::levelorder
utils::Range< IteratorLevelorder< true > > levelorder(ElementType const &element)
Definition: tree/iterator/levelorder.hpp:260
genesis::tree::TreeEdge::secondary_node
TreeNode & secondary_node()
Return the TreeNode of this TreeEdge that points away from the root.
Definition: edge.hpp:162
genesis::utils::Matrix< double >
distances.hpp
Header of CommonTree distance methods.
genesis::tree::Tree
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
genesis::tree::CommonEdgeData::branch_length
double branch_length
Branch length of the edge.
Definition: tree/common_tree/tree.hpp:193
genesis::tree::deepest_distance
double deepest_distance(Tree const &tree)
Return the longest distance from any point in the tree (on the edges) to any leaf.
Definition: tree/common_tree/distances.cpp:275
genesis::tree::TreeEdge
Definition: edge.hpp:60
genesis::tree::TreeNode
Definition: tree/tree/node.hpp:58
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::tree::TreeEdge::primary_node
TreeNode & primary_node()
Return the TreeNode of this TreeEdge that points towards the root.
Definition: edge.hpp:146
genesis::tree::Tree::nodes
utils::Range< IteratorNodes > nodes()
Definition: tree/tree.hpp:418
genesis::tree::Tree::edges
utils::Range< IteratorEdges > edges()
Definition: tree/tree.hpp:452
genesis::tree::CommonEdgeData
Common class containing the commonly needed data for tree edges.
Definition: tree/common_tree/tree.hpp:144
operators.hpp
Matrix operators.
genesis::tree::node_branch_length_distance_matrix
utils::Matrix< double > node_branch_length_distance_matrix(Tree const &tree)
Return a distance matrix containing pairwise distances between all Nodes, using the branch_length of ...
Definition: tree/common_tree/distances.cpp:57
genesis::tree::TreeNode::index
size_t index() const
Return the index of this Node.
Definition: tree/tree/node.hpp:102
genesis::tree::edge_branch_length_distance_matrix
utils::Matrix< double > edge_branch_length_distance_matrix(Tree const &tree)
Definition: tree/common_tree/distances.cpp:155
genesis::tree::TreeEdge::data
EdgeDataType & data()
Definition: edge.hpp:217
genesis::tree::is_leaf
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
Definition: tree/function/functions.cpp:59
genesis::utils::Matrix::rows
size_t rows() const
Definition: containers/matrix.hpp:176
levelorder.hpp
genesis::tree::closest_leaf_distance_vector
std::vector< std::pair< TreeNode const *, double > > closest_leaf_distance_vector(Tree const &tree)
Return a vector containing the closest leaf node for each node, using the branch_length as distance m...
Definition: tree/common_tree/distances.cpp:345
lca_lookup.hpp
genesis::tree::edge_branch_length_distance_vector
std::vector< double > edge_branch_length_distance_vector(Tree const &tree, TreeEdge const *edge)
Definition: tree/common_tree/distances.cpp:221
preorder.hpp
genesis::tree::Tree::edge_count
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272