A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tree/function/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 #include "genesis/tree/tree.hpp"
37 
39 
40 #include <algorithm>
41 #include <cassert>
42 #include <functional>
43 #include <unordered_set>
44 #include <vector>
45 
46 #ifdef GENESIS_OPENMP
47 # include <omp.h>
48 #endif
49 
50 namespace genesis {
51 namespace tree {
52 
53 // =================================================================================================
54 // Node Count Properties
55 // =================================================================================================
56 
57 size_t max_rank( Tree const& tree )
58 {
59  size_t max = 0;
60  for( size_t i = 0; i < tree.node_count(); ++i ) {
61  max = std::max( max, tree.node_at(i).rank() );
62  }
63  return max;
64 }
65 
66 bool is_bifurcating( Tree const& tree )
67 {
68  return max_rank( tree ) == 2;
69 }
70 
71 size_t leaf_node_count( Tree const& tree )
72 {
73  size_t sum = 0;
74  for (size_t i = 0; i < tree.node_count(); ++i) {
75  auto const& n = tree.node_at(i);
76  if (n.is_leaf()) {
77  ++sum;
78  }
79  }
80  return sum;
81 }
82 
83 size_t inner_node_count( Tree const& tree )
84 {
85  return tree.node_count() - leaf_node_count( tree );
86 }
87 
88 size_t leaf_edge_count( Tree const& tree )
89 {
90  size_t sum = 0;
91  for( auto const& edge : tree.edges() ) {
92  if( edge->primary_node().is_leaf() || edge->secondary_node().is_leaf() ) {
93  ++sum;
94  }
95  }
96  return sum;
97 }
98 
99 size_t inner_edge_count( Tree const& tree )
100 {
101  size_t sum = 0;
102  for( auto const& edge : tree.edges() ) {
103  if( edge->primary_node().is_inner() && edge->secondary_node().is_inner() ) {
104  ++sum;
105  }
106  }
107  return sum;
108 }
109 
110 // =================================================================================================
111 // Tree Sides
112 // =================================================================================================
113 
115 {
116  // Get a quadratic matrix: for each edge, it gives a value whether each other edge is
117  // proximal (1) or distal (-1) relative to itself (0).
118  auto result = utils::Matrix<signed char>( tree.edge_count(), tree.edge_count(), 0 );
119 
120  // Helper function that traverses the subtree starting at a link,
121  // and for each edge in the subtree, sets its entry in the matrix to the given sign.
122  auto traverse = [&]( TreeLink const& start_link, size_t i, signed char sign ){
123  auto link = &( start_link.next() );
124  while( link != &start_link ) {
125  result( i, link->edge().index() ) = sign;
126  link = &( link->outer().next() );
127  }
128  };
129 
130  // For each edge, do the traversal in both directions and set the signs.
131  // This can probably be done more efficiently with only one smart traversal of the whole tree,
132  // but this function is not needed often enough right now.
133  for( size_t i = 0; i < tree.edge_count(); ++i ) {
134  traverse( tree.edge_at(i).primary_link(), i, -1 );
135  traverse( tree.edge_at(i).secondary_link(), i, 1 );
136  }
137 
138  return result;
139 }
140 
142 {
143  auto mat = utils::Matrix<signed char>( tree.node_count(), tree.node_count(), 0 );
144 
145  // Fill every row of the matrix.
146  #pragma omp parallel for
147  for( size_t i = 0; i < tree.node_count(); ++i ) {
148  auto const& row_node = tree.node_at(i);
149  auto const row_index = row_node.index();
150  auto const primary_link = &row_node.primary_link();
151 
152  // Fill root side subtree with 1s.
153  // We do set the value of inner nodes multiple times, but that's no problem.
154  // Also, we need to do an extra check for the root here, in order to set all
155  // subtrees of the root to -1.
156  signed char const value = row_node.is_root() ? -1 : 1;
157  auto current_link = &( primary_link->outer() );
158  while( current_link != primary_link ) {
159  mat( row_index, current_link->node().index() ) = value;
160  current_link = &( current_link->next().outer() );
161  }
162 
163  // Fill all non-root side subtrees with -1s.
164  // We explicitly go through all non-root links of the node, in order to be really clear
165  // about our intentions. It would also work to simply follow the link chain until
166  // we reach the primary link again. However, this would also set -1 to our row node,
167  // thus we'd have to reset it, making the algorithm a bit messy.
168  // So, to be clear and clean, we avoid this.
169  auto sub_link = &( primary_link->next() );
170  while( sub_link != primary_link ) {
171 
172  // Now, for a given non-root subtree, set everything to -1.
173  current_link = &( sub_link->outer() );
174  while( current_link != sub_link ) {
175  mat( row_index, current_link->node().index() ) = -1;
176  current_link = &( current_link->next().outer() );
177  }
178 
179  // Go to next subtree.
180  sub_link = &( sub_link->next() );
181  }
182 
183  // Check that the diagonal element is untouched.
184  assert( mat( row_index, row_index ) == 0 );
185  }
186 
187  return mat;
188 }
189 
190 // =================================================================================================
191 // Subtrees
192 // =================================================================================================
193 
194 size_t subtree_size( Tree const& tree, TreeLink const& link )
195 {
196  if( ! belongs_to( tree, link )) {
197  throw std::runtime_error(
198  "Cannot caluclate subtree_size, as the given Link does not belong to the Tree."
199  );
200  }
201 
202  // TODO This is a quick and dirty solution. Traverse the whole subtree, add all nodes to a set
203  // and simply return the size of that set.
204 
205  std::unordered_set< TreeNode const* > visited_nodes;
206 
207  auto cur_link = &link.outer();
208  while( cur_link != &link ) {
209  visited_nodes.insert( &cur_link->node() );
210  cur_link = &cur_link->next().outer();
211  }
212 
213  return visited_nodes.size();
214 }
215 
216 std::vector<size_t> subtree_sizes( Tree const& tree, TreeNode const& node )
217 {
218  if( ! belongs_to( tree, node )) {
219  throw std::runtime_error(
220  "Cannot calculate subtree_sizes(), as the given Node does not belong to the Tree."
221  );
222  }
223 
224  // TODO this is an overly complex and thus error prone solution, maybe there is a better way?!
225 
226  // Prepare result vector.
227  std::vector<size_t> result;
228  result.resize( tree.node_count(), 0 );
229 
230  // We use a stack to track the subtree sizes.
231  // We store the entry link of the preorder traversal of the nodes. The entry link is the one
232  // that is given when visiting the node first while doing a eulertour traversal of the tree.
233  // This is always the next() link after the towards-the-starting-node/root link.
234  std::vector< TreeLink const* > stack;
235  stack.push_back( &node.link() );
236 
237  // Traverse the tree.
238  for( auto const& it : eulertour( node )) {
239 
240  // If this is the last time we visit that node on our way back up the tree.
241  // (The second part of the condition checks whether it is the starting node, because
242  // in this case, we do not want to remove it.)
243  if( &it.link().next() == stack.back() && stack.back() != &node.link() ) {
244 
245  // We finished with a subtree. Add the cummulative number of children of that subtree
246  // to the parent node, and remove the parent from the stack (as we are done with it).
247  auto st_size = result[ stack.back()->node().index() ];
248  stack.pop_back();
249  result[ stack.back()->node().index() ] += st_size;
250 
251  // If this node is already the current top stack element.
252  } else if( &it.node() == &stack.back()->node() ) {
253 
254  // Do nothing.
255 
256  // If it is a leaf.
257  } else if( it.link().is_leaf() ) {
258 
259  // Simply increment its parent's counter.
260  ++result[ stack.back()->node().index() ];
261 
262  // If we will visit that node in the future again.
263  } else {
264 
265  // Add a count for the immediate child (i.e., the current node) to the current stack
266  // end (i.e., increment the counter of children of that node),
267  // then add the current node itself to the stack, so that in the next iteration,
268  // we will increase its counts.
269  ++result[ stack.back()->node().index() ];
270  stack.push_back( &it.link() );
271  }
272  }
273 
274  // The stack now should contain only a single node, which is the starting node itself.
275  assert( stack.size() == 1 && stack.back() == &node.link() );
276 
277  // The size of the subtree of the starting node is always the number of nodes in the tree
278  // minus one for that node itself (as it is not counted as part of its subtree).
279  assert( result[ node.index() ] = tree.node_count() - 1 );
280 
281  return result;
282 }
283 
284 std::vector<size_t> subtree_sizes( Tree const& tree )
285 {
286  return subtree_sizes( tree, tree.root_node() );
287 }
288 
289 size_t subtree_max_path_height( Tree const& tree, TreeLink const& link )
290 {
291  if( ! belongs_to( tree, link )) {
292  throw std::runtime_error(
293  "Cannot calculate subtree_max_path_height(), "
294  "as the given Link does not belong to the Tree."
295  );
296  }
297 
298  // TODO make more efficient. no need for full dist vector.
299  auto dists = node_path_length_vector( tree, link.outer().node() );
300  size_t max = 0;
301 
302  auto cur_link = &link.outer();
303  while( cur_link != &link ) {
304  max = std::max( max, dists[ cur_link->node().index() ] );
305  cur_link = &cur_link->next().outer();
306  }
307  return max;
308 }
309 
310 std::vector<size_t> subtree_max_path_heights( Tree const& tree, TreeNode const& node )
311 {
312  if( ! belongs_to( tree, node )) {
313  throw std::runtime_error(
314  "Cannot calculate subtree_max_path_heights(), "
315  "as the given Node does not belong to the Tree."
316  );
317  }
318 
319  auto result = std::vector<size_t>( tree.node_count(), 0 );
320 
321  // Recursive helper function that evaluates the wanted size for a given subtree,
322  // stores the result in the vector and returns it for recursive usage.
323  std::function< size_t( TreeLink const* )> rec_subtree_height = [&]( TreeLink const* l ) {
324  size_t link_max = 0;
325  TreeLink const* cl = &l->next();
326  while( cl != l ) {
327  link_max = std::max( link_max, 1 + rec_subtree_height( &cl->outer() ));
328  cl = &cl->next();
329  }
330 
331  result[ l->node().index() ] = link_max;
332  return link_max;
333  };
334 
335  // Loop all subtrees of the given node and find the highest.
336  // This loop is a bit different from the one in the recursive function, as we need to evaluate
337  // all links of the given starting node, instead of just the ones away from the start node.
338  size_t node_max = 0;
339  TreeLink const* cur_l = &node.link();
340  do {
341  node_max = std::max( node_max, 1 + rec_subtree_height( &cur_l->outer() ));
342  cur_l = &cur_l->next();
343  } while( cur_l != &node.link() );
344  result[ node.index() ] = node_max;
345 
346  return result;
347 }
348 
349 std::vector<size_t> subtree_max_path_heights( Tree const& tree )
350 {
351  return subtree_max_path_heights( tree, tree.root_node() );
352 }
353 
354 // =================================================================================================
355 // Misc
356 // =================================================================================================
357 
358 std::vector< TreeLink const* > path_to_root( TreeNode const& node )
359 {
360  std::vector< TreeLink const* > path;
361 
362  // Move towards the root and record all links in between.
363  TreeLink const* cur_link = &node.primary_link();
364  while( &cur_link->edge().secondary_link() == cur_link ) {
365 
366  // The above while condition means: is it the root?! Assert, that the default way of
367  // checking for the root by using the node gives the same result.
368  assert( ! cur_link->node().is_root() );
369 
370  // Assert that the primary direction is correct.
371  assert( cur_link == &cur_link->edge().secondary_link() );
372 
373  // Add the primary link of the current node to the list.
374  path.push_back( cur_link );
375 
376  // Move one node towards the root.
377  // Assert that the default way of finding the next node towards the root (by using
378  // the edge) gives the same result as simply using the link's outer node.
379  // This is the case because the cur link is the one that points towards the root
380  // (which was asserted above).
381  assert( &cur_link->edge().primary_link() == &cur_link->outer() );
382  cur_link = &cur_link->outer().node().primary_link();
383  }
384 
385  // Now finally add the root itself and return the list.
386  assert( cur_link->node().is_root() );
387  path.push_back( cur_link );
388  return path;
389 }
390 
391 TreeNode const& lowest_common_ancestor( TreeNode const& node_a, TreeNode const& node_b )
392 {
393  // Speedup and simplification.
394  if( &node_a == &node_b ) {
395  return node_a;
396  }
397 
398  auto path_a = path_to_root( node_a );
399  auto path_b = path_to_root( node_b );
400 
401  // We must have at least the two original links in the front and the root in the back.
402  assert( path_a.size() > 0 && path_b.size() > 0 );
403  assert( path_a.front() == &node_a.link() );
404  assert( path_b.front() == &node_b.link() );
405  assert( path_a.back() == path_b.back() );
406 
407  // Remove from back as long as the last two elements are the same.
408  // At the end of this, the remaining links are the ones on the path between
409  // the two original links.
410  while(
411  path_a.size() > 1 &&
412  path_b.size() > 1 &&
413  path_a.at( path_a.size() - 1 ) == path_b.at( path_b.size() - 1 ) &&
414  path_a.at( path_a.size() - 2 ) == path_b.at( path_b.size() - 2 )
415  ) {
416  path_a.pop_back();
417  path_b.pop_back();
418  }
419 
420  // Now, the last elements need to be the same (the LCA of the start and finish node).
421  assert( path_a.size() > 0 && path_b.size() > 0 );
422  assert( path_a.back() == path_b.back() );
423 
424  return path_a.back()->node();
425 }
426 
428 {
429  auto const& c_node_a = static_cast< TreeNode const& >( node_a );
430  auto const& c_node_b = static_cast< TreeNode const& >( node_b );
431  return const_cast< TreeNode& >( lowest_common_ancestor( c_node_a, c_node_b ));
432 }
433 
435 {
436  auto res = utils::Matrix<size_t>( tree.node_count(), tree.node_count() );
437 
438  // This is not the best way to calculate all pairwise LCAs.
439  // In the Quartet Scores code, we use range minimum queries and eulertours to achive the
440  // same result in less time. But for now, this code is good enough.
441 
442  // Parallel specialized code.
443  #ifdef GENESIS_OPENMP
444 
445  // We only need to calculate the upper triangle. Get the number of indices needed
446  // to describe this triangle.
447  size_t const max_k = utils::triangular_size( tree.node_count() );
448 
449  #pragma omp parallel for
450  for( size_t k = 0; k < max_k; ++k ) {
451 
452  // For the given linear index, get the actual position in the Matrix.
453  auto const rc = utils::triangular_indices( k, tree.node_count() );
454  auto const r = rc.first;
455  auto const c = rc.second;
456 
457  auto const& lca = lowest_common_ancestor( tree.node_at(r), tree.node_at(c) );
458  res( r, c ) = lca.index();
459  res( c, r ) = lca.index();
460  }
461 
462  // Lastly, because the trinangular indices exluce the diagonale, we need to fill this
463  // by hand. Luckily, those are always the indices themselves, as the LCA of a node
464  // and itself is again itself.
465  #pragma omp parallel for
466  for( size_t d = 0; d < tree.node_count(); ++d ) {
467  res( d, d ) = d;
468  }
469 
470  // If no threads are available at all, use serial version.
471  #else
472 
473  for( size_t r = 0; r < tree.node_count(); ++r ) {
474 
475  // The result is symmetric - we only calculate the upper triangle.
476  for( size_t c = r; c < tree.node_count(); ++c ) {
477 
478  auto const& lca = lowest_common_ancestor( tree.node_at(r), tree.node_at(c) );
479  res( r, c ) = lca.index();
480  res( c, r ) = lca.index();
481  }
482 
483  // See above: the diagonale contains its indices.
484  assert( res( r, r ) == r );
485  }
486 
487  #endif
488 
489  return res;
490 }
491 
492 } // namespace tree
493 } // namespace genesis
std::vector< size_t > node_path_length_vector(Tree const &tree, TreeNode const &node)
Return a vector containing the depth of all nodes with respect to the given start node...
utils::Range< IteratorPath< TreeLink const, TreeNode const, TreeEdge const > > path(ElementType const &start, ElementType const &finish)
Definition: path.hpp:325
TreeLink & secondary_link()
Return the TreeLink of this TreeEdge that points away from the root.
Definition: edge.cpp:69
size_t inner_node_count(Tree const &tree)
Count the number of inner Nodes.
TreeLink & link()
Return the TreeLink that points towards the root.
Definition: node.cpp:75
TreeNode const & lowest_common_ancestor(TreeNode const &node_a, TreeNode const &node_b)
Return the lowest common ancestor of two TreeNodes.
size_t inner_edge_count(Tree const &tree)
Return the number of Edges of a Tree that do not lead to a leaf Node.
TreeNode & node_at(size_t index)
Return the TreeNode at a certain index.
Definition: tree/tree.cpp:304
size_t max_rank(Tree const &tree)
Return the highest rank of the Nodes of a Tree.
utils::Matrix< signed char > node_root_direction_matrix(Tree const &tree)
Calculate a Matrix that indicates the nodes on the root side of a given node.
Tree operator functions.
bool is_root() const
True iff the node is the root of its Tree.
Definition: node.cpp:209
size_t subtree_max_path_height(Tree const &tree, TreeLink const &link)
Calculate the height of a subtree, that is, the maximum path length to a leaf of that subtree...
size_t leaf_node_count(Tree const &tree)
Count the number of leaf Nodes of a Tree.
utils::Matrix< signed char > edge_sides(Tree const &tree)
Create a Matrix that indiciaces the relative position of the Edges of a Tree, i.e., whether they are on the root side or non-root side.
double sum(const Histogram &h)
size_t index() const
Return the index of this Node.
Definition: node.cpp:48
std::vector< size_t > subtree_sizes(Tree const &tree, TreeNode const &node)
Calculate the sizes of all subtrees as seen from the given TreeNode.
TreeNode & root_node()
Return the TreeNode at the current root of the Tree.
Definition: tree/tree.cpp:258
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.cpp:350
size_t subtree_size(Tree const &tree, TreeLink const &link)
Return the size of the subtree defined by the given TreeLink, measured in number of nodes...
TreeLink & primary_link()
Return the TreeLink that points towards the root.
Definition: node.cpp:56
utils::Matrix< size_t > lowest_common_ancestors(Tree const &tree)
Return the lowest common ancestor of each pair of TreeNodes for a given tree, in form of a Matrix of ...
TreeLink & primary_link()
Return the TreeLink of this TreeEdge that points towards the root.
Definition: edge.cpp:53
utils::Range< IteratorEulertour< TreeLink const, TreeNode const, TreeEdge const > > eulertour(ElementType const &element)
Definition: eulertour.hpp:190
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
bool belongs_to(Tree const &tree, TreeNode const &node)
Return whether the TreeNode belongs to the Tree, i.e., whether it is owned by the Tree...
size_t triangular_size(size_t n)
Calculate the number of linear indices needed for a triangular Matrix of size n.
std::vector< size_t > subtree_max_path_heights(Tree const &tree, TreeNode const &node)
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
size_t leaf_edge_count(Tree const &tree)
Return the number of Edges of a Tree that lead to a leaf Node.
std::pair< size_t, size_t > triangular_indices(size_t k, size_t n)
Given a linear index in a upper triangular Matrix, find the corresponding Matrix indices.
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.cpp:324
Matrix operators.
bool is_bifurcating(Tree const &tree)
Return whether the Tree is bifurcating.
size_t rank() const
Rank of the node, i.e. how many immediate children it has.
Definition: node.cpp:176
Header of Tree class.
Header of Tree distance methods.
std::vector< TreeLink const * > path_to_root(TreeNode const &node)
Helper function that finds all TreeLinks between a given TreeNode and the root of the Tree...