A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tree/function/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 #include "genesis/tree/tree.hpp"
37 
39 
40 #include <algorithm>
41 #include <cassert>
42 #include <functional>
43 #include <unordered_set>
44 #include <vector>
45 
46 #ifdef GENESIS_OPENMP
47 # include <omp.h>
48 #endif
49 
50 namespace genesis {
51 namespace tree {
52 
53 // =================================================================================================
54 // Node Count Properties
55 // =================================================================================================
56 
57 size_t max_rank( Tree const& tree )
58 {
59  size_t max = 0;
60  for( size_t i = 0; i < tree.node_count(); ++i ) {
61  max = std::max( max, tree.node_at(i).rank() );
62  }
63  return max;
64 }
65 
66 bool is_bifurcating( Tree const& tree )
67 {
68  return max_rank( tree ) == 2;
69 }
70 
71 size_t leaf_node_count( Tree const& tree )
72 {
73  size_t sum = 0;
74  for (size_t i = 0; i < tree.node_count(); ++i) {
75  auto const& n = tree.node_at(i);
76  if (n.is_leaf()) {
77  ++sum;
78  }
79  }
80  return sum;
81 }
82 
83 size_t inner_node_count( Tree const& tree )
84 {
85  return tree.node_count() - leaf_node_count( tree );
86 }
87 
88 size_t node_count( Tree const& tree )
89 {
90  return tree.node_count();
91 }
92 
93 size_t leaf_edge_count( Tree const& tree )
94 {
95  size_t sum = 0;
96  for( auto const& edge : tree.edges() ) {
97  if( edge->primary_node().is_leaf() || edge->secondary_node().is_leaf() ) {
98  ++sum;
99  }
100  }
101  return sum;
102 }
103 
104 size_t inner_edge_count( Tree const& tree )
105 {
106  size_t sum = 0;
107  for( auto const& edge : tree.edges() ) {
108  if( edge->primary_node().is_inner() && edge->secondary_node().is_inner() ) {
109  ++sum;
110  }
111  }
112  return sum;
113 }
114 
115 size_t edge_count( Tree const& tree )
116 {
117  return tree.edge_count();
118 }
119 
120 std::vector<size_t> inner_edge_indices( Tree const& tree )
121 {
122  std::vector<size_t> result;
123  for( auto const& edge_it : tree.edges() ) {
124  if( edge_it->secondary_node().is_inner() ) {
125  result.push_back( edge_it->index() );
126  }
127  }
128  return result;
129 }
130 
131 std::vector<size_t> leaf_edge_indices( Tree const& tree )
132 {
133  std::vector<size_t> result;
134  for( auto const& edge_it : tree.edges() ) {
135  if( edge_it->secondary_node().is_leaf() ) {
136  result.push_back( edge_it->index() );
137  }
138  }
139  return result;
140 }
141 
142 std::vector<size_t> inner_node_indices( Tree const& tree )
143 {
144  std::vector<size_t> result;
145  for( auto const& node_it : tree.nodes() ) {
146  if( node_it->is_inner() ) {
147  result.push_back( node_it->index() );
148  }
149  }
150  return result;
151 }
152 
153 std::vector<size_t> leaf_node_indices( Tree const& tree )
154 {
155  std::vector<size_t> result;
156  for( auto const& node_it : tree.nodes() ) {
157  if( node_it->is_leaf() ) {
158  result.push_back( node_it->index() );
159  }
160  }
161  return result;
162 }
163 
164 // =================================================================================================
165 // Tree Sides
166 // =================================================================================================
167 
169 {
170  // Get a quadratic matrix: for each edge, it gives a value whether each other edge is
171  // proximal (1) or distal (-1) relative to itself (0).
172  auto result = utils::Matrix<signed char>( tree.edge_count(), tree.edge_count(), 0 );
173 
174  // Helper function that traverses the subtree starting at a link,
175  // and for each edge in the subtree, sets its entry in the matrix to the given sign.
176  auto traverse = [&]( TreeLink const& start_link, size_t i, signed char sign ){
177  auto link = &( start_link.next() );
178  while( link != &start_link ) {
179  result( i, link->edge().index() ) = sign;
180  link = &( link->outer().next() );
181  }
182  };
183 
184  // For each edge, do the traversal in both directions and set the signs.
185  // This can probably be done more efficiently with only one smart traversal of the whole tree,
186  // but this function is not needed often enough right now.
187  for( size_t i = 0; i < tree.edge_count(); ++i ) {
188  traverse( tree.edge_at(i).primary_link(), i, -1 );
189  traverse( tree.edge_at(i).secondary_link(), i, 1 );
190  }
191 
192  return result;
193 }
194 
196 {
197  auto mat = utils::Matrix<signed char>( tree.node_count(), tree.node_count(), 0 );
198 
199  // Fill every row of the matrix.
200  #pragma omp parallel for
201  for( size_t i = 0; i < tree.node_count(); ++i ) {
202  auto const& row_node = tree.node_at(i);
203  auto const row_index = row_node.index();
204  auto const primary_link = &row_node.primary_link();
205 
206  // Fill root side subtree with 1s.
207  // We do set the value of inner nodes multiple times, but that's no problem.
208  // Also, we need to do an extra check for the root here, in order to set all
209  // subtrees of the root to -1.
210  signed char const value = row_node.is_root() ? -1 : 1;
211  auto current_link = &( primary_link->outer() );
212  while( current_link != primary_link ) {
213  mat( row_index, current_link->node().index() ) = value;
214  current_link = &( current_link->next().outer() );
215  }
216 
217  // Fill all non-root side subtrees with -1s.
218  // We explicitly go through all non-root links of the node, in order to be really clear
219  // about our intentions. It would also work to simply follow the link chain until
220  // we reach the primary link again. However, this would also set -1 to our row node,
221  // thus we'd have to reset it, making the algorithm a bit messy.
222  // So, to be clear and clean, we avoid this.
223  auto sub_link = &( primary_link->next() );
224  while( sub_link != primary_link ) {
225 
226  // Now, for a given non-root subtree, set everything to -1.
227  current_link = &( sub_link->outer() );
228  while( current_link != sub_link ) {
229  mat( row_index, current_link->node().index() ) = -1;
230  current_link = &( current_link->next().outer() );
231  }
232 
233  // Go to next subtree.
234  sub_link = &( sub_link->next() );
235  }
236 
237  // Check that the diagonal element is untouched.
238  assert( mat( row_index, row_index ) == 0 );
239  }
240 
241  return mat;
242 }
243 
244 // =================================================================================================
245 // Subtrees
246 // =================================================================================================
247 
248 size_t subtree_size( Tree const& tree, TreeLink const& link )
249 {
250  if( ! belongs_to( tree, link )) {
251  throw std::runtime_error(
252  "Cannot caluclate subtree_size, as the given Link does not belong to the Tree."
253  );
254  }
255 
256  // TODO This is a quick and dirty solution. Traverse the whole subtree, add all nodes to a set
257  // and simply return the size of that set.
258 
259  std::unordered_set< TreeNode const* > visited_nodes;
260 
261  auto cur_link = &link.outer();
262  while( cur_link != &link ) {
263  visited_nodes.insert( &cur_link->node() );
264  cur_link = &cur_link->next().outer();
265  }
266 
267  return visited_nodes.size();
268 }
269 
270 std::vector<size_t> subtree_sizes( Tree const& tree, TreeNode const& node )
271 {
272  if( ! belongs_to( tree, node )) {
273  throw std::runtime_error(
274  "Cannot calculate subtree_sizes(), as the given Node does not belong to the Tree."
275  );
276  }
277 
278  // TODO this is an overly complex and thus error prone solution, maybe there is a better way?!
279 
280  // Prepare result vector.
281  std::vector<size_t> result;
282  result.resize( tree.node_count(), 0 );
283 
284  // We use a stack to track the subtree sizes.
285  // We store the entry link of the preorder traversal of the nodes. The entry link is the one
286  // that is given when visiting the node first while doing a eulertour traversal of the tree.
287  // This is always the next() link after the towards-the-starting-node/root link.
288  std::vector< TreeLink const* > stack;
289  stack.push_back( &node.link() );
290 
291  // Traverse the tree.
292  for( auto const& it : eulertour( node )) {
293 
294  // If this is the last time we visit that node on our way back up the tree.
295  // (The second part of the condition checks whether it is the starting node, because
296  // in this case, we do not want to remove it.)
297  if( &it.link().next() == stack.back() && stack.back() != &node.link() ) {
298 
299  // We finished with a subtree. Add the cummulative number of children of that subtree
300  // to the parent node, and remove the parent from the stack (as we are done with it).
301  auto st_size = result[ stack.back()->node().index() ];
302  stack.pop_back();
303  result[ stack.back()->node().index() ] += st_size;
304 
305  // If this node is already the current top stack element.
306  } else if( &it.node() == &stack.back()->node() ) {
307 
308  // Do nothing.
309 
310  // If it is a leaf.
311  } else if( it.link().is_leaf() ) {
312 
313  // Simply increment its parent's counter.
314  ++result[ stack.back()->node().index() ];
315 
316  // If we will visit that node in the future again.
317  } else {
318 
319  // Add a count for the immediate child (i.e., the current node) to the current stack
320  // end (i.e., increment the counter of children of that node),
321  // then add the current node itself to the stack, so that in the next iteration,
322  // we will increase its counts.
323  ++result[ stack.back()->node().index() ];
324  stack.push_back( &it.link() );
325  }
326  }
327 
328  // The stack now should contain only a single node, which is the starting node itself.
329  assert( stack.size() == 1 && stack.back() == &node.link() );
330 
331  // The size of the subtree of the starting node is always the number of nodes in the tree
332  // minus one for that node itself (as it is not counted as part of its subtree).
333  assert( result[ node.index() ] = tree.node_count() - 1 );
334 
335  return result;
336 }
337 
338 std::vector<size_t> subtree_sizes( Tree const& tree )
339 {
340  return subtree_sizes( tree, tree.root_node() );
341 }
342 
343 size_t subtree_max_path_height( Tree const& tree, TreeLink const& link )
344 {
345  if( ! belongs_to( tree, link )) {
346  throw std::runtime_error(
347  "Cannot calculate subtree_max_path_height(), "
348  "as the given Link does not belong to the Tree."
349  );
350  }
351 
352  // TODO make more efficient. no need for full dist vector.
353  auto dists = node_path_length_vector( tree, link.outer().node() );
354  size_t max = 0;
355 
356  auto cur_link = &link.outer();
357  while( cur_link != &link ) {
358  max = std::max( max, dists[ cur_link->node().index() ] );
359  cur_link = &cur_link->next().outer();
360  }
361  return max;
362 }
363 
364 std::vector<size_t> subtree_max_path_heights( Tree const& tree, TreeNode const& node )
365 {
366  if( ! belongs_to( tree, node )) {
367  throw std::runtime_error(
368  "Cannot calculate subtree_max_path_heights(), "
369  "as the given Node does not belong to the Tree."
370  );
371  }
372 
373  auto result = std::vector<size_t>( tree.node_count(), 0 );
374 
375  // Recursive helper function that evaluates the wanted size for a given subtree,
376  // stores the result in the vector and returns it for recursive usage.
377  std::function< size_t( TreeLink const* )> rec_subtree_height = [&]( TreeLink const* l ) {
378  size_t link_max = 0;
379  TreeLink const* cl = &l->next();
380  while( cl != l ) {
381  link_max = std::max( link_max, 1 + rec_subtree_height( &cl->outer() ));
382  cl = &cl->next();
383  }
384 
385  result[ l->node().index() ] = link_max;
386  return link_max;
387  };
388 
389  // Loop all subtrees of the given node and find the highest.
390  // This loop is a bit different from the one in the recursive function, as we need to evaluate
391  // all links of the given starting node, instead of just the ones away from the start node.
392  size_t node_max = 0;
393  TreeLink const* cur_l = &node.link();
394  do {
395  node_max = std::max( node_max, 1 + rec_subtree_height( &cur_l->outer() ));
396  cur_l = &cur_l->next();
397  } while( cur_l != &node.link() );
398  result[ node.index() ] = node_max;
399 
400  return result;
401 }
402 
403 std::vector<size_t> subtree_max_path_heights( Tree const& tree )
404 {
405  return subtree_max_path_heights( tree, tree.root_node() );
406 }
407 
408 // =================================================================================================
409 // Misc
410 // =================================================================================================
411 
412 std::vector< TreeLink const* > path_to_root( TreeNode const& node )
413 {
414  std::vector< TreeLink const* > path;
415 
416  // Move towards the root and record all links in between.
417  TreeLink const* cur_link = &node.primary_link();
418  while( &cur_link->edge().secondary_link() == cur_link ) {
419 
420  // The above while condition means: is it the root?! Assert, that the default way of
421  // checking for the root by using the node gives the same result.
422  assert( ! cur_link->node().is_root() );
423 
424  // Assert that the primary direction is correct.
425  assert( cur_link == &cur_link->edge().secondary_link() );
426 
427  // Add the primary link of the current node to the list.
428  path.push_back( cur_link );
429 
430  // Move one node towards the root.
431  // Assert that the default way of finding the next node towards the root (by using
432  // the edge) gives the same result as simply using the link's outer node.
433  // This is the case because the cur link is the one that points towards the root
434  // (which was asserted above).
435  assert( &cur_link->edge().primary_link() == &cur_link->outer() );
436  cur_link = &cur_link->outer().node().primary_link();
437  }
438 
439  // Now finally add the root itself and return the list.
440  assert( cur_link->node().is_root() );
441  path.push_back( cur_link );
442  return path;
443 }
444 
445 TreeNode const& lowest_common_ancestor( TreeNode const& node_a, TreeNode const& node_b )
446 {
447  // Speedup and simplification.
448  if( &node_a == &node_b ) {
449  return node_a;
450  }
451 
452  auto path_a = path_to_root( node_a );
453  auto path_b = path_to_root( node_b );
454 
455  // We must have at least the two original links in the front and the root in the back.
456  assert( path_a.size() > 0 && path_b.size() > 0 );
457  assert( path_a.front() == &node_a.link() );
458  assert( path_b.front() == &node_b.link() );
459  assert( path_a.back() == path_b.back() );
460 
461  // Remove from back as long as the last two elements are the same.
462  // At the end of this, the remaining links are the ones on the path between
463  // the two original links.
464  while(
465  path_a.size() > 1 &&
466  path_b.size() > 1 &&
467  path_a.at( path_a.size() - 1 ) == path_b.at( path_b.size() - 1 ) &&
468  path_a.at( path_a.size() - 2 ) == path_b.at( path_b.size() - 2 )
469  ) {
470  path_a.pop_back();
471  path_b.pop_back();
472  }
473 
474  // Now, the last elements need to be the same (the LCA of the start and finish node).
475  assert( path_a.size() > 0 && path_b.size() > 0 );
476  assert( path_a.back() == path_b.back() );
477 
478  return path_a.back()->node();
479 }
480 
482 {
483  auto const& c_node_a = static_cast< TreeNode const& >( node_a );
484  auto const& c_node_b = static_cast< TreeNode const& >( node_b );
485  return const_cast< TreeNode& >( lowest_common_ancestor( c_node_a, c_node_b ));
486 }
487 
489 {
490  auto res = utils::Matrix<size_t>( tree.node_count(), tree.node_count() );
491 
492  // This is not the best way to calculate all pairwise LCAs.
493  // In the Quartet Scores code, we use range minimum queries and eulertours to achive the
494  // same result in less time. But for now, this code is good enough.
495 
496  // Parallel specialized code.
497  #ifdef GENESIS_OPENMP
498 
499  // We only need to calculate the upper triangle. Get the number of indices needed
500  // to describe this triangle.
501  size_t const max_k = utils::triangular_size( tree.node_count() );
502 
503  #pragma omp parallel for
504  for( size_t k = 0; k < max_k; ++k ) {
505 
506  // For the given linear index, get the actual position in the Matrix.
507  auto const rc = utils::triangular_indices( k, tree.node_count() );
508  auto const r = rc.first;
509  auto const c = rc.second;
510 
511  auto const& lca = lowest_common_ancestor( tree.node_at(r), tree.node_at(c) );
512  res( r, c ) = lca.index();
513  res( c, r ) = lca.index();
514  }
515 
516  // Lastly, because the trinangular indices exluce the diagonale, we need to fill this
517  // by hand. Luckily, those are always the indices themselves, as the LCA of a node
518  // and itself is again itself.
519  #pragma omp parallel for
520  for( size_t d = 0; d < tree.node_count(); ++d ) {
521  res( d, d ) = d;
522  }
523 
524  // If no threads are available at all, use serial version.
525  #else
526 
527  for( size_t r = 0; r < tree.node_count(); ++r ) {
528 
529  // The result is symmetric - we only calculate the upper triangle.
530  for( size_t c = r; c < tree.node_count(); ++c ) {
531 
532  auto const& lca = lowest_common_ancestor( tree.node_at(r), tree.node_at(c) );
533  res( r, c ) = lca.index();
534  res( c, r ) = lca.index();
535  }
536 
537  // See above: the diagonale contains its indices.
538  assert( res( r, r ) == r );
539  }
540 
541  #endif
542 
543  return res;
544 }
545 
546 } // namespace tree
547 } // namespace genesis
std::vector< size_t > inner_node_indices(Tree const &tree)
Get a list of the node indices of all inner TreeNodes.
std::vector< size_t > node_path_length_vector(Tree const &tree, TreeNode const &node)
Return a vector containing the depth of all nodes with respect to the given start node...
utils::Range< IteratorPath< TreeLink const, TreeNode const, TreeEdge const > > path(ElementType const &start, ElementType const &finish)
Definition: path.hpp:325
TreeLink & secondary_link()
Return the TreeLink of this TreeEdge that points away from the root.
Definition: edge.cpp:69
size_t inner_node_count(Tree const &tree)
Count the number of inner Nodes.
TreeLink & link()
Return the TreeLink that points towards the root.
Definition: node.cpp:75
TreeNode const & lowest_common_ancestor(TreeNode const &node_a, TreeNode const &node_b)
Return the lowest common ancestor of two TreeNodes.
std::vector< size_t > leaf_node_indices(Tree const &tree)
Get a list of the node indices of all leaf TreeNodes.
size_t inner_edge_count(Tree const &tree)
Return the number of Edges of a Tree that do not lead to a leaf Node.
TreeNode & node_at(size_t index)
Return the TreeNode at a certain index.
Definition: tree/tree.cpp:304
size_t max_rank(Tree const &tree)
Return the highest rank of the Nodes of a Tree.
utils::Matrix< signed char > node_root_direction_matrix(Tree const &tree)
Calculate a Matrix that indicates the nodes on the root side of a given node.
Tree operator functions.
bool is_root() const
True iff the node is the root of its Tree.
Definition: node.cpp:209
size_t subtree_max_path_height(Tree const &tree, TreeLink const &link)
Calculate the height of a subtree, that is, the maximum path length to a leaf of that subtree...
size_t leaf_node_count(Tree const &tree)
Count the number of leaf Nodes of a Tree.
utils::Matrix< signed char > edge_sides(Tree const &tree)
Create a Matrix that indiciaces the relative position of the Edges of a Tree, i.e., whether they are on the root side or non-root side.
double sum(const Histogram &h)
utils::Range< IteratorNodes > nodes()
Definition: tree/tree.cpp:484
size_t index() const
Return the index of this Node.
Definition: node.cpp:48
std::vector< size_t > subtree_sizes(Tree const &tree, TreeNode const &node)
Calculate the sizes of all subtrees as seen from the given TreeNode.
TreeNode & root_node()
Return the TreeNode at the current root of the Tree.
Definition: tree/tree.cpp:258
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.cpp:350
Matrix operators.
size_t subtree_size(Tree const &tree, TreeLink const &link)
Return the size of the subtree defined by the given TreeLink, measured in number of nodes...
TreeLink & primary_link()
Return the TreeLink that points towards the root.
Definition: node.cpp:56
utils::Matrix< size_t > lowest_common_ancestors(Tree const &tree)
Return the lowest common ancestor of each pair of TreeNodes for a given tree, in form of a Matrix of ...
TreeLink & primary_link()
Return the TreeLink of this TreeEdge that points towards the root.
Definition: edge.cpp:53
utils::Range< IteratorEulertour< TreeLink const, TreeNode const, TreeEdge const > > eulertour(ElementType const &element)
Definition: eulertour.hpp:190
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
bool belongs_to(Tree const &tree, TreeNode const &node)
Return whether the TreeNode belongs to the Tree, i.e., whether it is owned by the Tree...
size_t triangular_size(size_t n)
Calculate the number of linear indices needed for a triangular Matrix of size n.
std::vector< size_t > leaf_edge_indices(Tree const &tree)
Get a list of the edge indices of all leaf edges, that is, all TreeEdges that lead to a leaf TreeNode...
std::vector< size_t > subtree_max_path_heights(Tree const &tree, TreeNode const &node)
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
size_t leaf_edge_count(Tree const &tree)
Return the number of Edges of a Tree that lead to a leaf Node.
std::pair< size_t, size_t > triangular_indices(size_t k, size_t n)
Given a linear index in a upper triangular Matrix, find the corresponding Matrix indices.
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.cpp:324
size_t node_count(Tree const &tree)
Return the number of Nodes of a Tree. Same as Tree::node_count().
bool is_bifurcating(Tree const &tree)
Return whether the Tree is bifurcating.
size_t rank() const
Rank of the node, i.e. how many immediate children it has.
Definition: node.cpp:176
Header of Tree class.
Header of Tree distance methods.
std::vector< size_t > inner_edge_indices(Tree const &tree)
Get a list of the edge indices of all inner edges, that is, all TreeEdges that do not lead to a leaf ...
std::vector< TreeLink const * > path_to_root(TreeNode const &node)
Helper function that finds all TreeLinks between a given TreeNode and the root of the Tree...
size_t edge_count(Tree const &tree)
Return the number of Edges of a Tree. Same as Tree::edge_count().