A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tree/function/distances.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
34 #include "genesis/tree/tree.hpp"
36 
38 
39 #include <algorithm>
40 #include <cassert>
41 #include <limits>
42 #include <stdexcept>
43 
44 #ifdef GENESIS_OPENMP
45 # include <omp.h>
46 #endif
47 
48 namespace genesis {
49 namespace tree {
50 
51 // =================================================================================================
52 // Node Distance Measures
53 // =================================================================================================
54 
56  Tree const& tree
57 ) {
58  auto max_val = std::numeric_limits<size_t>::max();
59  utils::Matrix<size_t> mat( tree.node_count(), tree.node_count(), max_val );
60 
61  // Fill every row of the matrix.
62  #pragma omp parallel for
63  for( size_t ni = 0; ni < tree.node_count(); ++ni ) {
64  auto const& row_node = tree.node_at( ni );
65  assert( row_node.index() == ni );
66 
67  // Set the diagonal element of the matrix.
68  mat( row_node.index(), row_node.index() ) = 0;
69 
70  // The columns are filled using a levelorder traversal. This makes sure that for every node
71  // we know how to calculate the distance to the current row node.
72  // Unfortunately, this prevents us from simply calculating the upper triangle of the matrix
73  // and copying it (distance is symmetric), because we do not really know which nodes are in
74  // which half during a levelorder traversal...
75  for( auto it : levelorder( row_node.link() )) {
76  // Skip the diagonal of the matrix.
77  if (it.is_first_iteration()) {
78  assert( it.node().index() == row_node.index() );
79  continue;
80  }
81 
82  // Make sure we don't have touched the current position, but have calculated
83  // the needed dependency already.
84  assert( mat(row_node.index(), it.node().index()) == max_val );
85  assert( mat(row_node.index(), it.link().outer().node().index()) != max_val );
86 
87  // The distance to the current row node is one more than the distance from the other
88  // end of that branch to the row node.
89  mat( row_node.index(), it.node().index() )
90  = 1 + mat(row_node.index(), it.link().outer().node().index());
91  }
92  }
93 
94  return mat;
95 }
96 
97 std::vector<size_t> node_path_length_vector(
98  Tree const& tree,
99  TreeNode const& node
100 ) {
101  if( ! belongs_to( tree, node )) {
102  throw std::runtime_error(
103  "Cannot caluclate node_path_length_vector, as the given Node does not belong to the Tree."
104  );
105  }
106  auto max_val = std::numeric_limits<size_t>::max();
107 
108  // store the distance from each node to the given node.
109  std::vector<size_t> vec;
110  vec.resize(tree.node_count(), max_val);
111  vec[ node.index() ] = 0;
112 
113  // calculate the distance vector via levelorder iteration.
114  for( auto it : levelorder( node ) ) {
115  // skip the starting node (it is already set to 0).
116  if (it.is_first_iteration()) {
117  continue;
118  }
119 
120  // we do not have the distance of the current node, but the one of its "parent" (the one in
121  // direction of the starting node)!
122  assert( vec[it.node().index()] == max_val );
123  assert( vec[it.link().outer().node().index()] != max_val );
124 
125  // the distance is the distance from the "parent" node (the next one in direction towards
126  // the given node) plus 1.
127  vec[it.node().index()] = 1 + vec[it.link().outer().node().index()];
128  }
129 
130  return vec;
131 }
132 
133 std::vector<size_t> node_path_length_vector(
134  Tree const& tree
135 ) {
136  return node_path_length_vector( tree, tree.root_node() );
137 }
138 
139 // =================================================================================================
140 // Edge Distance Measures
141 // =================================================================================================
142 
144  Tree const& tree
145 ) {
146  // Result matrix that will be returned.
147  utils::Matrix<size_t> mat (tree.edge_count(), tree.edge_count());
148 
149  // For calculating the distance between edges, we use the distances between nodes and for every
150  // pair of edged find the nodes at the ends of the edges that are closest to each other. This
151  // is then the shortest distance between the two edges.
152  auto node_depth_mat = node_path_length_matrix(tree);
153 
154  #pragma omp parallel for
155  for( size_t ei = 0; ei < tree.edge_count(); ++ei ) {
156  auto const& row_edge = tree.edge_at( ei );
157  assert( row_edge.index() == ei );
158 
159  for( auto const& col_edge : tree.edges() ) {
160 
161  // Set the diagonal element of the matrix. We don't need to compare nodes in this case.
162  if (row_edge.index() == col_edge->index()) {
163  mat(row_edge.index(), row_edge.index()) = 0;
164  continue;
165  }
166 
167  // primary-primary case
168  auto pp = node_depth_mat(
169  row_edge.primary_node().index(),
170  col_edge->primary_node().index()
171  );
172 
173  // primary-secondary case
174  auto ps = node_depth_mat(
175  row_edge.primary_node().index(),
176  col_edge->secondary_node().index()
177  );
178 
179  // secondary-primary case
180  auto sp = node_depth_mat(
181  row_edge.secondary_node().index(),
182  col_edge->primary_node().index()
183  );
184 
185  // Find min. Make sure that the fourth case "secondary-secondary" is not shorter
186  // (if this ever happens, the tree is broken).
187  auto dist = std::min({ pp, ps, sp });
188  assert( dist <= node_depth_mat(
189  row_edge.secondary_node().index(),
190  col_edge->secondary_node().index()
191  ));
192 
193  // Store in matrix.
194  mat( row_edge.index(), col_edge->index() ) = dist + 1;
195  }
196  }
197 
198  return mat;
199 }
200 
201 std::vector<size_t> edge_path_length_vector(
202  Tree const& tree,
203  TreeEdge const& edge
204 ) {
205  if( ! belongs_to( tree, edge )) {
206  throw std::runtime_error(
207  "Cannot caluclate node_path_length_vector, as the given Edge does not belong to the Tree."
208  );
209  }
210 
211  auto max_val = std::numeric_limits<size_t>::max();
212  std::vector<size_t> vec( tree.edge_count(), max_val );
213 
214  // We just need two rows of the distance matrix - let's take the vectors instead for speed.
215  auto p_node_dist = node_path_length_vector(tree, edge.primary_node());
216  auto s_node_dist = node_path_length_vector(tree, edge.secondary_node());
217 
218  for( auto const& col_edge : tree.edges() ) {
219 
220  if( edge.index() == col_edge->index() ) {
221  vec[ edge.index() ] = 0;
222  continue;
223  }
224 
225  // primary-primary case
226  double pp = p_node_dist[ col_edge->primary_node().index() ];
227 
228  // primary-secondary case
229  double ps = p_node_dist[ col_edge->secondary_node().index() ];
230 
231  // secondary-primary case
232  double sp = s_node_dist[ col_edge->primary_node().index() ];
233 
234  // Find min. Make sure that the fourth case "secondary-secondary" is not shorter
235  // (if this ever happens, the tree is broken).
236  double dist = std::min({ pp, ps, sp });
237  assert(dist <= s_node_dist[ col_edge->secondary_node().index() ]);
238 
239  // Store in vector.
240  vec[ col_edge->index() ] = dist + 1;
241  }
242 
243  return vec;
244 }
245 
246 // =================================================================================================
247 // Complex Distance Methods
248 // =================================================================================================
249 
250 std::vector< std::pair< TreeNode const*, size_t >> closest_leaf_depth_vector (
251  const Tree& tree
252 ) {
253  // prepare a result vector with the size of number of nodes.
254  std::vector< std::pair< TreeNode const*, size_t >> vec;
255  vec.resize(tree.node_count(), {nullptr, 0});
256 
257  // fill the vector for every node.
258  // this could be speed up by doing a postorder traversal followed by some sort of inside-out
259  // traversal (preorder might do the job). but for now, this simple O(n^2) version works, too.
260  for (auto node_it = tree.begin_nodes(); node_it != tree.end_nodes(); ++node_it) {
261  auto node = node_it->get();
262 
263  // we have not visited this node. assertion holds as long as the indices are correct.
264  assert(vec[node->index()].first == nullptr);
265 
266  // look for closest leaf node by doing a levelorder traversal.
267  for( auto it : levelorder( *node ) ) {
268  // if we find a leaf, leave the loop.
269  if (it.node().is_leaf()) {
270  vec[node->index()].first = &it.node();
271  vec[node->index()].second = it.depth();
272  break;
273  }
274  }
275  }
276 
277  return vec;
278 }
279 
280 } // namespace tree
281 } // namespace genesis
std::vector< size_t > node_path_length_vector(Tree const &tree, TreeNode const &node)
Return a vector containing the depth of all nodes with respect to the given start node...
std::vector< size_t > edge_path_length_vector(Tree const &tree, TreeEdge const &edge)
utils::Matrix< size_t > node_path_length_matrix(Tree const &tree)
Return a matrix containing the pairwise depth of all nodes of the tree.
size_t index() const
Return the index of this Edge.
Definition: edge.cpp:45
TreeNode & node_at(size_t index)
Return the TreeNode at a certain index.
Definition: tree/tree.cpp:304
Tree operator functions.
size_t index() const
Return the index of this Node.
Definition: node.cpp:48
utils::Range< IteratorLevelorder< TreeLink const, TreeNode const, TreeEdge const > > levelorder(ElementType const &element)
std::vector< std::pair< TreeNode const *, size_t > > closest_leaf_depth_vector(const Tree &tree)
Returns a vector containing the closest leaf node for each node, measured in number of edges between ...
utils::Matrix< size_t > edge_path_length_matrix(Tree const &tree)
IteratorNodes end_nodes()
Definition: tree/tree.cpp:469
TreeNode & root_node()
Return the TreeNode at the current root of the Tree.
Definition: tree/tree.cpp:258
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.cpp:350
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
bool belongs_to(Tree const &tree, TreeNode const &node)
Return whether the TreeNode belongs to the Tree, i.e., whether it is owned by the Tree...
TreeNode & secondary_node()
Return the TreeNode of this TreeEdge that points away from the root.
Definition: edge.cpp:101
IteratorNodes begin_nodes()
Definition: tree/tree.cpp:464
Provides easy and fast logging functionality.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.cpp:324
TreeNode & primary_node()
Return the TreeNode of this TreeEdge that points towards the root.
Definition: edge.cpp:85
Header of Tree class.
Header of Tree distance methods.