A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tree/default/distances.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
33 #include "genesis/tree/tree.hpp"
36 
38 
39 #include <algorithm>
40 #include <assert.h>
41 #include <stdexcept>
42 
43 #ifdef GENESIS_OPENMP
44 # include <omp.h>
45 #endif
46 
47 namespace genesis {
48 namespace tree {
49 
50 // =================================================================================================
51 // Branch Distance Measures
52 // =================================================================================================
53 
55  Tree const& tree
56 ) {
57  // Init result matrix.
58  auto const node_count = tree.node_count();
59  utils::Matrix<double> result( node_count, node_count, 0.0 );
60 
61  // Get distances from every node to the root, in branch length units.
62  auto const dists_to_root = node_branch_length_distance_vector( tree );
63 
64  // Get an LCA lookup for the tree.
65  auto const lca_lookup = LcaLookup( tree );
66 
67  // Parallel specialized code.
68  #ifdef GENESIS_OPENMP
69 
70  // We only need to calculate the upper triangle.
71  // Get the number of indices needed to describe this triangle.
72  size_t const max_k = utils::triangular_size( node_count );
73 
74  // Calculate distance matrix for every pair of nodes.
75  #pragma omp parallel for
76  for( size_t k = 0; k < max_k; ++k ) {
77 
78  // For the given linear index, get the actual position in the Matrix.
79  auto const ij = utils::triangular_indices( k, node_count );
80  auto const i = ij.first;
81  auto const j = ij.second;
82 
83  // Make sure we have not touched those entries yet.
84  assert( result(i, j) == 0.0 );
85  assert( result(j, i) == 0.0 );
86 
87  // Calculate and store distance.
88  auto const lca = lca_lookup( i, j );
89  auto const dist = dists_to_root[i] + dists_to_root[j] - 2 * dists_to_root[lca];
90  result(i, j) = dist;
91  result(j, i) = dist;
92  }
93 
94  // If no threads are available at all, use serial version.
95  #else
96 
97  // Calculate distance matrix for every pair of nodes.
98  for( size_t i = 0; i < node_count; ++i ) {
99 
100  // The result is symmetric - we only calculate the upper triangle.
101  for( size_t j = i + 1; j < node_count; ++j ) {
102 
103  // Make sure we have not touched those entries yet.
104  assert( result(i, j) == 0.0 );
105  assert( result(j, i) == 0.0 );
106 
107  // Calculate and store distance.
108  auto const lca = lca_lookup( i, j );
109  auto const dist = dists_to_root[i] + dists_to_root[j] - 2 * dists_to_root[lca];
110  result(i, j) = dist;
111  result(j, i) = dist;
112  }
113  }
114 
115  #endif
116 
117  return result;
118 }
119 
121  Tree const& tree,
122  TreeNode const* node
123 ) {
124  if (!node) {
125  node = &tree.root_node();
126  }
127 
128  // Store the distance from each node to the given node.
129  auto vec = std::vector<double>(tree.node_count(), -1.0);
130  vec[node->index()] = 0.0;
131 
132  // Calculate the distance vector via levelorder iteration.
133  for( auto it : levelorder( *node ) ) {
134  // Skip the starting node (it is already set to 0).
135  if (it.is_first_iteration()) {
136  continue;
137  }
138 
139  // We do not have the distance of the current node, but the one of its "parent" (the one in
140  // direction of the starting node)!
141  assert(vec[it.node().index()] == -1.0);
142  assert(vec[it.link().outer().node().index()] > -1.0);
143 
144  // The distance is the distance from the "parent" node (the next one in direction towards
145  // the starting node) plus the branch length.
146  vec[it.node().index()]
147  = vec[it.link().outer().node().index()]
148  + it.edge().data<DefaultEdgeData>().branch_length;
149  }
150 
151  return vec;
152 }
153 
155  Tree const& tree
156 ) {
157  // Result matrix that will be returned.
158  utils::Matrix<double> mat (tree.edge_count(), tree.edge_count());
159 
160  // For calculating the distance between edges, we use the distances between nodes and for every
161  // pair of edged find the nodes at the ends of the edges that are closest to each other. This
162  // is then the shortest distance between the two edges.
163  // There is probably a way to get this distance via some tree traversal, which would save us
164  // some lookups and calculation of the min, but be more complex and error prone.
165  // For now, this version should be fast enough.
166  auto node_dist_mat = node_branch_length_distance_matrix(tree);
167 
168  #pragma omp parallel for
169  for( size_t i = 0; i < tree.edge_count(); ++i ) {
170  auto const& row_edge = tree.edge_at(i);
171 
172  for (auto col_it = tree.begin_edges(); col_it != tree.end_edges(); ++col_it) {
173  auto const& col_edge = *col_it->get();
174 
175  // Set the diagonal element of the matrix. We don't need to compare nodes in this case,
176  // and particularly want to skip the part where we add half the branch lengths to the
177  // distance. After all, the distance between and element and itself should always be 0!
178  if (row_edge.index() == col_edge.index()) {
179  mat(row_edge.index(), row_edge.index()) = 0.0;
180  continue;
181  }
182 
183  // primary-primary case
184  double pp = node_dist_mat(
185  row_edge.primary_node().index(),
186  col_edge.primary_node().index()
187  );
188 
189  // primary-secondary case
190  double ps = node_dist_mat(
191  row_edge.primary_node().index(),
192  col_edge.secondary_node().index()
193  );
194 
195  // secondary-primary case
196  double sp = node_dist_mat(
197  row_edge.secondary_node().index(),
198  col_edge.primary_node().index()
199  );
200 
201  // Find min. Make sure that the fourth case "secondary-secondary" is not shorter
202  // (if this ever happens, the tree is broken).
203  double dist = std::min(pp, std::min(ps, sp));
204  assert(dist <= node_dist_mat(
205  row_edge.secondary_node().index(),
206  col_edge.secondary_node().index()
207  ));
208 
209  // Store in matrix, with halves of the branch lengths.
210  mat(row_edge.index(), col_edge.index())
211  = (dist)
212  + ( row_edge.data<DefaultEdgeData>().branch_length / 2.0 )
213  + ( col_edge.data<DefaultEdgeData>().branch_length / 2.0 )
214  ;
215  }
216  }
217 
218  return mat;
219 }
220 
222  Tree const& tree,
223  TreeEdge const* edge
224 ) {
225  std::vector<double> vec (tree.edge_count());
226 
227  if( edge == nullptr ) {
228  throw std::runtime_error( "Cannot use nullptr edge for distance calulcation." );
229  }
230 
231  // Works similar to edge_branch_length_distance_matrix(). See there for a description of the implementation.
232 
233  // We just need two rows of the distance matrix - let's take the vectors instead for speed.
234  auto p_node_dist = node_branch_length_distance_vector(tree, &edge->primary_node());
235  auto s_node_dist = node_branch_length_distance_vector(tree, &edge->secondary_node());
236 
237  #pragma omp parallel for
238  for( size_t i = 0; i < tree.edge_count(); ++i ) {
239  auto const& col_edge = tree.edge_at(i);
240 
241  if (edge->index() == col_edge.index()) {
242  vec[ edge->index() ] = 0.0;
243  continue;
244  }
245 
246  // primary-primary case
247  double pp = p_node_dist[ col_edge.primary_node().index() ];
248 
249  // primary-secondary case
250  double ps = p_node_dist[ col_edge.secondary_node().index() ];
251 
252  // secondary-primary case
253  double sp = s_node_dist[ col_edge.primary_node().index() ];
254 
255  // Find min. Make sure that the fourth case "secondary-secondary" is not shorter
256  // (if this ever happens, the tree is broken).
257  double dist = std::min(pp, std::min(ps, sp));
258  assert(dist <= s_node_dist[ col_edge.secondary_node().index() ]);
259 
260  // Store in vector, with halves of the branch lengths.
261  vec[ col_edge.index() ]
262  = ( dist )
263  + ( edge->data<DefaultEdgeData>().branch_length / 2.0 )
264  + ( col_edge.data<DefaultEdgeData>().branch_length / 2.0 )
265  ;
266  }
267 
268  return vec;
269 }
270 
271 // =================================================================================================
272 // Complex Distance Methods
273 // =================================================================================================
274 
275 double deepest_distance(Tree const& tree)
276 {
277  double max = 0.0;
278  auto leaf_dist = closest_leaf_distance_vector(tree);
279 
280  for( auto const& e : tree.edges() ) {
281  int idx_p = e->primary_node().index();
282  int idx_s = e->secondary_node().index();
283 
284  double d = (leaf_dist[idx_p].second
285  + e->data<DefaultEdgeData>().branch_length
286  + leaf_dist[ idx_s ].second) / 2.0;
287 
288  if (d > max) {
289  max = d;
290  }
291  }
292  return max;
293 }
294 
299 template< class Comparator >
300 std::vector<std::pair< TreeNode const*, double >> leaf_distance_vector(
301  Tree const& tree,
302  utils::Matrix<double> const& node_distances,
303  Comparator comp
304 ) {
305  // prepare a result vector with the size of number of nodes.
306  std::vector<std::pair< TreeNode const*, double>> vec;
307  vec.resize(tree.node_count(), {nullptr, 0.0});
308 
309  if( node_distances.rows() != tree.node_count() || node_distances.cols() != tree.node_count() ) {
310  throw std::runtime_error( "Invalid node_branch_length_distance_matrix." );
311  }
312 
313  // fill the vector for every node.
314  // there is probably a faster way of doing this: preorder traversal with pruning. but for now,
315  // this simple O(n^2) version works.
316  for (auto node_it = tree.begin_nodes(); node_it != tree.end_nodes(); ++node_it) {
317  auto node = node_it->get();
318 
319  // we have not visited this node. assertion holds as long as the indices are correct.
320  assert(vec[node->index()].first == nullptr);
321 
322  TreeNode const* res_node = nullptr;
323  double res_dist = 0.0;
324 
325  // try out all other nodes, and find the closest leaf.
326  for (auto other_it = tree.begin_nodes(); other_it != tree.end_nodes(); ++other_it) {
327  auto other = other_it->get();
328 
329  if (!other->is_leaf()) {
330  continue;
331  }
332 
333  double dist = node_distances(node->index(), other->index());
334  if (res_node == nullptr || comp( dist, res_dist )) {
335  res_node = other;
336  res_dist = dist;
337  }
338  }
339 
340  vec[ node->index() ].first = res_node;
341  vec[ node->index() ].second = res_dist;
342  }
343 
344  return vec;
345 }
346 
347 std::vector<std::pair< TreeNode const*, double >> closest_leaf_distance_vector(
348  Tree const& tree
349 ) {
350  // we need the pairwise distances between all nodes, so we can do quick lookups.
351  auto node_distances = node_branch_length_distance_matrix(tree);
352 
353  return leaf_distance_vector( tree, node_distances, std::less<double>() );
354 }
355 
356 std::vector<std::pair< TreeNode const*, double>> closest_leaf_distance_vector(
357  Tree const& tree,
358  utils::Matrix<double> const& node_distances
359 ) {
360  return leaf_distance_vector( tree, node_distances, std::less<double>() );
361 }
362 
363 std::vector<std::pair< TreeNode const*, double>> furthest_leaf_distance_vector(
364  Tree const& tree
365 ) {
366  // we need the pairwise distances between all nodes, so we can do quick lookups.
367  auto node_distances = node_branch_length_distance_matrix(tree);
368 
369  return leaf_distance_vector( tree, node_distances, std::greater<double>() );
370 }
371 
372 std::vector<std::pair< TreeNode const*, double>> furthest_leaf_distance_vector(
373  Tree const& tree,
374  utils::Matrix<double> const& node_distances
375 ) {
376  return leaf_distance_vector( tree, node_distances, std::greater<double>() );
377 }
378 
379 } // namespace tree
380 } // namespace genesis
IteratorEdges begin_edges()
Definition: tree/tree.cpp:498
IteratorEdges end_edges()
Definition: tree/tree.cpp:503
size_t index() const
Return the index of this Edge.
Definition: edge.cpp:45
size_t cols() const
Definition: matrix.hpp:156
Fast lookup of the lowest common ancestor (LCA) of two TreeNodes, relative to an arbitrary root node...
Definition: lca_lookup.hpp:66
size_t index() const
Return the index of this Node.
Definition: node.cpp:48
utils::Range< IteratorLevelorder< TreeLink const, TreeNode const, TreeEdge const > > levelorder(ElementType const &element)
utils::Matrix< double > edge_branch_length_distance_matrix(Tree const &tree)
IteratorNodes end_nodes()
Definition: tree/tree.cpp:469
Default class containing the commonly needed data for tree edges.
TreeNode & root_node()
Return the TreeNode at the current root of the Tree.
Definition: tree/tree.cpp:258
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.cpp:350
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
std::vector< std::pair< TreeNode const *, double > > leaf_distance_vector(Tree const &tree, utils::Matrix< double > const &node_distances, Comparator comp)
Local helper function to calculate either closest_leaf_distance_vector() or furthest_leaf_distance_ve...
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
size_t triangular_size(size_t n)
Calculate the number of linear indices needed for a triangular Matrix of size n.
utils::Matrix< double > node_branch_length_distance_matrix(Tree const &tree)
Return a distance matrix containing pairwise distances between all Nodes, using the branch_length of ...
TreeNode & secondary_node()
Return the TreeNode of this TreeEdge that points away from the root.
Definition: edge.cpp:101
IteratorNodes begin_nodes()
Definition: tree/tree.cpp:464
std::vector< double > edge_branch_length_distance_vector(Tree const &tree, TreeEdge const *edge)
Header of Default Tree distance methods.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
std::vector< std::pair< TreeNode const *, double > > furthest_leaf_distance_vector(Tree const &tree)
Opposite of closest_leaf_distance_vector().
double branch_length
Branch length of the edge.
std::pair< size_t, size_t > triangular_indices(size_t k, size_t n)
Given a linear index in a upper triangular Matrix, find the corresponding Matrix indices.
size_t rows() const
Definition: matrix.hpp:151
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.cpp:324
Matrix operators.
TreeNode & primary_node()
Return the TreeNode of this TreeEdge that points towards the root.
Definition: edge.cpp:85
double deepest_distance(Tree const &tree)
Return the longest distance from any point in the tree (on the edges) to any leaf.
std::vector< std::pair< TreeNode const *, double > > closest_leaf_distance_vector(Tree const &tree)
Return a vector containing the closest leaf node for each node, using the branch_length as distance m...
std::vector< double > node_branch_length_distance_vector(Tree const &tree, TreeNode const *node)
Return a vector containing the distance of all nodes with respect to the given start node...
Header of Tree class.
EdgeDataType & data()
Definition: edge.hpp:118