A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tree/default/distances.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 #include "genesis/tree/tree.hpp"
37 
39 
40 #include <algorithm>
41 #include <assert.h>
42 #include <stdexcept>
43 
44 #ifdef GENESIS_OPENMP
45 # include <omp.h>
46 #endif
47 
48 namespace genesis {
49 namespace tree {
50 
51 // =================================================================================================
52 // Branch Distance Measures
53 // =================================================================================================
54 
56  Tree const& tree
57 ) {
58  // Init result matrix.
59  auto const node_count = tree.node_count();
61 
62  // Get distances from every node to the root, in branch length units.
63  auto const dists_to_root = node_branch_length_distance_vector( tree );
64 
65  // Get an LCA lookup for the tree.
66  auto const lca_lookup = LcaLookup( tree );
67 
68  // Parallel specialized code.
69  #ifdef GENESIS_OPENMP
70 
71  // We only need to calculate the upper triangle.
72  // Get the number of indices needed to describe this triangle.
73  size_t const max_k = utils::triangular_size( node_count );
74 
75  // Calculate distance matrix for every pair of nodes.
76  #pragma omp parallel for
77  for( size_t k = 0; k < max_k; ++k ) {
78 
79  // For the given linear index, get the actual position in the Matrix.
80  auto const ij = utils::triangular_indices( k, node_count );
81  auto const i = ij.first;
82  auto const j = ij.second;
83 
84  // Make sure we have not touched those entries yet.
85  assert( result(i, j) == 0.0 );
86  assert( result(j, i) == 0.0 );
87 
88  // Calculate and store distance.
89  auto const lca = lca_lookup( i, j );
90  auto const dist = dists_to_root[i] + dists_to_root[j] - 2 * dists_to_root[lca];
91  result(i, j) = dist;
92  result(j, i) = dist;
93  }
94 
95  // If no threads are available at all, use serial version.
96  #else
97 
98  // Calculate distance matrix for every pair of nodes.
99  for( size_t i = 0; i < node_count; ++i ) {
100 
101  // The result is symmetric - we only calculate the upper triangle.
102  for( size_t j = i + 1; j < node_count; ++j ) {
103 
104  // Make sure we have not touched those entries yet.
105  assert( result(i, j) == 0.0 );
106  assert( result(j, i) == 0.0 );
107 
108  // Calculate and store distance.
109  auto const lca = lca_lookup( i, j );
110  auto const dist = dists_to_root[i] + dists_to_root[j] - 2 * dists_to_root[lca];
111  result(i, j) = dist;
112  result(j, i) = dist;
113  }
114  }
115 
116  #endif
117 
118  return result;
119 }
120 
122  Tree const& tree,
123  TreeNode const* node
124 ) {
125  if (!node) {
126  node = &tree.root_node();
127  }
128 
129  // Store the distance from each node to the given node.
130  auto vec = std::vector<double>(tree.node_count(), -1.0);
131  vec[node->index()] = 0.0;
132 
133  // Calculate the distance vector via levelorder iteration.
134  for( auto it : levelorder( *node ) ) {
135  // Skip the starting node (it is already set to 0).
136  if (it.is_first_iteration()) {
137  continue;
138  }
139 
140  // We do not have the distance of the current node, but the one of its "parent" (the one in
141  // direction of the starting node)!
142  assert(vec[it.node().index()] == -1.0);
143  assert(vec[it.link().outer().node().index()] > -1.0);
144 
145  // The distance is the distance from the "parent" node (the next one in direction towards
146  // the starting node) plus the branch length.
147  vec[it.node().index()]
148  = vec[it.link().outer().node().index()]
149  + it.edge().data<DefaultEdgeData>().branch_length;
150  }
151 
152  return vec;
153 }
154 
156  Tree const& tree
157 ) {
158  // Result matrix that will be returned.
159  utils::Matrix<double> mat (tree.edge_count(), tree.edge_count());
160 
161  // For calculating the distance between edges, we use the distances between nodes and for every
162  // pair of edged find the nodes at the ends of the edges that are closest to each other. This
163  // is then the shortest distance between the two edges.
164  // There is probably a way to get this distance via some tree traversal, which would save us
165  // some lookups and calculation of the min, but be more complex and error prone.
166  // For now, this version should be fast enough.
167  auto node_dist_mat = node_branch_length_distance_matrix(tree);
168 
169  #pragma omp parallel for
170  for( size_t i = 0; i < tree.edge_count(); ++i ) {
171  auto const& row_edge = tree.edge_at(i);
172 
173  for (auto col_it = tree.begin_edges(); col_it != tree.end_edges(); ++col_it) {
174  auto const& col_edge = *col_it->get();
175 
176  // Set the diagonal element of the matrix. We don't need to compare nodes in this case,
177  // and particularly want to skip the part where we add half the branch lengths to the
178  // distance. After all, the distance between and element and itself should always be 0!
179  if (row_edge.index() == col_edge.index()) {
180  mat(row_edge.index(), row_edge.index()) = 0.0;
181  continue;
182  }
183 
184  // primary-primary case
185  double pp = node_dist_mat(
186  row_edge.primary_node().index(),
187  col_edge.primary_node().index()
188  );
189 
190  // primary-secondary case
191  double ps = node_dist_mat(
192  row_edge.primary_node().index(),
193  col_edge.secondary_node().index()
194  );
195 
196  // secondary-primary case
197  double sp = node_dist_mat(
198  row_edge.secondary_node().index(),
199  col_edge.primary_node().index()
200  );
201 
202  // Find min. Make sure that the fourth case "secondary-secondary" is not shorter
203  // (if this ever happens, the tree is broken).
204  double dist = std::min(pp, std::min(ps, sp));
205  assert(dist <= node_dist_mat(
206  row_edge.secondary_node().index(),
207  col_edge.secondary_node().index()
208  ));
209 
210  // Store in matrix, with halves of the branch lengths.
211  mat(row_edge.index(), col_edge.index())
212  = (dist)
213  + ( row_edge.data<DefaultEdgeData>().branch_length / 2.0 )
214  + ( col_edge.data<DefaultEdgeData>().branch_length / 2.0 )
215  ;
216  }
217  }
218 
219  return mat;
220 }
221 
223  Tree const& tree,
224  TreeEdge const* edge
225 ) {
226  std::vector<double> vec (tree.edge_count());
227 
228  if( edge == nullptr ) {
229  throw std::runtime_error( "Cannot use nullptr edge for distance calulcation." );
230  }
231 
232  // Works similar to edge_branch_length_distance_matrix(). See there for a description of the implementation.
233 
234  // We just need two rows of the distance matrix - let's take the vectors instead for speed.
235  auto p_node_dist = node_branch_length_distance_vector(tree, &edge->primary_node());
236  auto s_node_dist = node_branch_length_distance_vector(tree, &edge->secondary_node());
237 
238  #pragma omp parallel for
239  for( size_t i = 0; i < tree.edge_count(); ++i ) {
240  auto const& col_edge = tree.edge_at(i);
241 
242  if (edge->index() == col_edge.index()) {
243  vec[ edge->index() ] = 0.0;
244  continue;
245  }
246 
247  // primary-primary case
248  double pp = p_node_dist[ col_edge.primary_node().index() ];
249 
250  // primary-secondary case
251  double ps = p_node_dist[ col_edge.secondary_node().index() ];
252 
253  // secondary-primary case
254  double sp = s_node_dist[ col_edge.primary_node().index() ];
255 
256  // Find min. Make sure that the fourth case "secondary-secondary" is not shorter
257  // (if this ever happens, the tree is broken).
258  double dist = std::min(pp, std::min(ps, sp));
259  assert(dist <= s_node_dist[ col_edge.secondary_node().index() ]);
260 
261  // Store in vector, with halves of the branch lengths.
262  vec[ col_edge.index() ]
263  = ( dist )
264  + ( edge->data<DefaultEdgeData>().branch_length / 2.0 )
265  + ( col_edge.data<DefaultEdgeData>().branch_length / 2.0 )
266  ;
267  }
268 
269  return vec;
270 }
271 
272 // =================================================================================================
273 // Complex Distance Methods
274 // =================================================================================================
275 
276 double deepest_distance(Tree const& tree)
277 {
278  double max = 0.0;
279  auto leaf_dist = closest_leaf_distance_vector(tree);
280 
281  for( auto const& e : tree.edges() ) {
282  int idx_p = e->primary_node().index();
283  int idx_s = e->secondary_node().index();
284 
285  double d = (leaf_dist[idx_p].second
286  + e->data<DefaultEdgeData>().branch_length
287  + leaf_dist[ idx_s ].second) / 2.0;
288 
289  if (d > max) {
290  max = d;
291  }
292  }
293  return max;
294 }
295 
300 template< class Comparator >
301 std::vector<std::pair< TreeNode const*, double >> leaf_distance_vector(
302  Tree const& tree,
303  utils::Matrix<double> const& node_distances,
304  Comparator comp
305 ) {
306  // prepare a result vector with the size of number of nodes.
307  std::vector<std::pair< TreeNode const*, double>> vec;
308  vec.resize(tree.node_count(), {nullptr, 0.0});
309 
310  if( node_distances.rows() != tree.node_count() || node_distances.cols() != tree.node_count() ) {
311  throw std::runtime_error( "Invalid node_branch_length_distance_matrix." );
312  }
313 
314  // fill the vector for every node.
315  // there is probably a faster way of doing this: preorder traversal with pruning. but for now,
316  // this simple O(n^2) version works.
317  for( auto node_it = tree.begin_nodes(); node_it != tree.end_nodes(); ++node_it ) {
318  auto node = node_it->get();
319 
320  // we have not visited this node. assertion holds as long as the indices are correct.
321  assert(vec[node->index()].first == nullptr);
322 
323  TreeNode const* res_node = nullptr;
324  double res_dist = 0.0;
325 
326  // try out all other nodes, and find the closest leaf.
327  for( auto other_it = tree.begin_nodes(); other_it != tree.end_nodes(); ++other_it ) {
328  auto other = other_it->get();
329 
330  if(! is_leaf( *other )) {
331  continue;
332  }
333 
334  double dist = node_distances(node->index(), other->index());
335  if( res_node == nullptr || comp( dist, res_dist )) {
336  res_node = other;
337  res_dist = dist;
338  }
339  }
340 
341  vec[ node->index() ].first = res_node;
342  vec[ node->index() ].second = res_dist;
343  }
344 
345  return vec;
346 }
347 
348 std::vector<std::pair< TreeNode const*, double >> closest_leaf_distance_vector(
349  Tree const& tree
350 ) {
351  // we need the pairwise distances between all nodes, so we can do quick lookups.
352  auto node_distances = node_branch_length_distance_matrix(tree);
353 
354  return leaf_distance_vector( tree, node_distances, std::less<double>() );
355 }
356 
357 std::vector<std::pair< TreeNode const*, double>> closest_leaf_distance_vector(
358  Tree const& tree,
359  utils::Matrix<double> const& node_distances
360 ) {
361  return leaf_distance_vector( tree, node_distances, std::less<double>() );
362 }
363 
364 std::vector<std::pair< TreeNode const*, double>> furthest_leaf_distance_vector(
365  Tree const& tree
366 ) {
367  // we need the pairwise distances between all nodes, so we can do quick lookups.
368  auto node_distances = node_branch_length_distance_matrix(tree);
369 
370  return leaf_distance_vector( tree, node_distances, std::greater<double>() );
371 }
372 
373 std::vector<std::pair< TreeNode const*, double>> furthest_leaf_distance_vector(
374  Tree const& tree,
375  utils::Matrix<double> const& node_distances
376 ) {
377  return leaf_distance_vector( tree, node_distances, std::greater<double>() );
378 }
379 
380 } // namespace tree
381 } // namespace genesis
IteratorEdges begin_edges()
Definition: tree/tree.cpp:498
IteratorEdges end_edges()
Definition: tree/tree.cpp:503
size_t index() const
Return the index of this Edge.
Definition: edge.hpp:105
Fast lookup of the lowest common ancestor (LCA) of two TreeNodes, relative to an arbitrary root node...
Definition: lca_lookup.hpp:66
TreeNode & secondary_node()
Return the TreeNode of this TreeEdge that points away from the root.
Definition: edge.hpp:161
size_t index() const
Return the index of this Node.
Definition: node.hpp:100
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
utils::Range< IteratorLevelorder< TreeLink const, TreeNode const, TreeEdge const > > levelorder(ElementType const &element)
utils::Matrix< double > edge_branch_length_distance_matrix(Tree const &tree)
IteratorNodes end_nodes()
Definition: tree/tree.cpp:469
Default class containing the commonly needed data for tree edges.
TreeNode & root_node()
Return the TreeNode at the current root of the Tree.
Definition: tree/tree.cpp:258
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.cpp:350
Matrix operators.
TreeNode & primary_node()
Return the TreeNode of this TreeEdge that points towards the root.
Definition: edge.hpp:145
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
std::vector< std::pair< TreeNode const *, double > > leaf_distance_vector(Tree const &tree, utils::Matrix< double > const &node_distances, Comparator comp)
Local helper function to calculate either closest_leaf_distance_vector() or furthest_leaf_distance_ve...
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
size_t triangular_size(size_t n)
Calculate the number of linear indices needed for a triangular Matrix of size n.
utils::Matrix< double > node_branch_length_distance_matrix(Tree const &tree)
Return a distance matrix containing pairwise distances between all Nodes, using the branch_length of ...
IteratorNodes begin_nodes()
Definition: tree/tree.cpp:464
std::vector< double > edge_branch_length_distance_vector(Tree const &tree, TreeEdge const *edge)
Header of Default Tree distance methods.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
std::vector< std::pair< TreeNode const *, double > > furthest_leaf_distance_vector(Tree const &tree)
Opposite of closest_leaf_distance_vector().
double branch_length
Branch length of the edge.
std::pair< size_t, size_t > triangular_indices(size_t k, size_t n)
Given a linear index in a upper triangular Matrix, find the corresponding Matrix indices.
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.cpp:324
size_t node_count(Tree const &tree)
Return the number of Nodes of a Tree. Same as Tree::node_count().
double deepest_distance(Tree const &tree)
Return the longest distance from any point in the tree (on the edges) to any leaf.
std::vector< std::pair< TreeNode const *, double > > closest_leaf_distance_vector(Tree const &tree)
Return a vector containing the closest leaf node for each node, using the branch_length as distance m...
std::vector< double > node_branch_length_distance_vector(Tree const &tree, TreeNode const *node)
Return a vector containing the distance of all nodes with respect to the given start node...
Header of Tree class.
EdgeDataType & data()
Definition: edge.hpp:183