A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tree/function/distances.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
34 #include "genesis/tree/tree.hpp"
36 
38 
39 #include <algorithm>
40 #include <assert.h>
41 #include <limits>
42 #include <stdexcept>
43 
44 namespace genesis {
45 namespace tree {
46 
47 // =================================================================================================
48 // Distance Measures
49 // =================================================================================================
50 
60  Tree const& tree
61 ) {
62  auto max_val = std::numeric_limits<size_t>::max();
63  utils::Matrix<size_t> mat( tree.node_count(), tree.node_count(), max_val );
64 
65  // Fill every row of the matrix.
66  for( auto const& row_node : tree.nodes() ) {
67 
68  // Set the diagonal element of the matrix.
69  mat( row_node->index(), row_node->index() ) = 0;
70 
71  // The columns are filled using a levelorder traversal. This makes sure that for every node
72  // we know how to calculate the distance to the current row node.
73  // Unfortunately, this prevents us from simply calculating the upper triangle of the matrix
74  // and copying it (distance is symmetric), because we do not really know which nodes are in
75  // which half during a levelorder traversal...
76  for( auto it : levelorder( row_node->link() )) {
77  // Skip the diagonal of the matrix.
78  if (it.is_first_iteration()) {
79  assert( it.node().index() == row_node->index() );
80  continue;
81  }
82 
83  // Make sure we don't have touched the current position, but have calculated
84  // the needed dependency already.
85  assert( mat(row_node->index(), it.node().index()) == max_val );
86  assert( mat(row_node->index(), it.link().outer().node().index()) != max_val );
87 
88  // The distance to the current row node is one more than the distance from the other
89  // end of that branch to the row node.
90  mat( row_node->index(), it.node().index() )
91  = 1 + mat(row_node->index(), it.link().outer().node().index());
92  }
93  }
94 
95  return mat;
96 }
97 
105 std::vector<size_t> node_path_length_vector(
106  Tree const& tree,
107  TreeNode const& node
108 ) {
109  if( ! belongs_to( tree, node )) {
110  throw std::runtime_error(
111  "Cannot caluclate node_path_length_vector, as the given Node does not belong to the Tree."
112  );
113  }
114  auto max_val = std::numeric_limits<size_t>::max();
115 
116  // store the distance from each node to the given node.
117  std::vector<size_t> vec;
118  vec.resize(tree.node_count(), max_val);
119  vec[ node.index() ] = 0;
120 
121  // calculate the distance vector via levelorder iteration.
122  for( auto it : levelorder( node ) ) {
123  // skip the starting node (it is already set to 0).
124  if (it.is_first_iteration()) {
125  continue;
126  }
127 
128  // we do not have the distance of the current node, but the one of its "parent" (the one in
129  // direction of the starting node)!
130  assert( vec[it.node().index()] == max_val );
131  assert( vec[it.link().outer().node().index()] != max_val );
132 
133  // the distance is the distance from the "parent" node (the next one in direction towards
134  // the given node) plus 1.
135  vec[it.node().index()] = 1 + vec[it.link().outer().node().index()];
136  }
137 
138  return vec;
139 }
140 
148 std::vector<size_t> node_path_length_vector(
149  Tree const& tree
150 ) {
151  return node_path_length_vector( tree, tree.root_node() );
152 }
153 
155  Tree const& tree
156 ) {
157  // Result matrix that will be returned.
158  utils::Matrix<size_t> mat (tree.edge_count(), tree.edge_count());
159 
160  // For calculating the distance between edges, we use the distances between nodes and for every
161  // pair of edged find the nodes at the ends of the edges that are closest to each other. This
162  // is then the shortest distance between the two edges.
163  auto node_depth_mat = node_path_length_matrix(tree);
164 
165  for( auto const& row_edge : tree.edges() ) {
166  for( auto const& col_edge : tree.edges() ) {
167 
168  // Set the diagonal element of the matrix. We don't need to compare nodes in this case.
169  if (row_edge->index() == col_edge->index()) {
170  mat(row_edge->index(), row_edge->index()) = 0;
171  continue;
172  }
173 
174  // primary-primary case
175  auto pp = node_depth_mat(
176  row_edge->primary_node().index(),
177  col_edge->primary_node().index()
178  );
179 
180  // primary-secondary case
181  auto ps = node_depth_mat(
182  row_edge->primary_node().index(),
183  col_edge->secondary_node().index()
184  );
185 
186  // secondary-primary case
187  auto sp = node_depth_mat(
188  row_edge->secondary_node().index(),
189  col_edge->primary_node().index()
190  );
191 
192  // Find min. Make sure that the fourth case "secondary-secondary" is not shorter
193  // (if this ever happens, the tree is broken).
194  auto dist = std::min(pp, std::min(ps, sp));
195  assert( dist <= node_depth_mat(
196  row_edge->secondary_node().index(),
197  col_edge->secondary_node().index()
198  ));
199 
200  // Store in matrix.
201  mat( row_edge->index(), col_edge->index() ) = dist + 1;
202  }
203  }
204 
205  return mat;
206 }
207 
208 std::vector<size_t> edge_path_length_vector(
209  Tree const& tree,
210  TreeEdge const& edge
211 ) {
212  if( ! belongs_to( tree, edge )) {
213  throw std::runtime_error(
214  "Cannot caluclate node_path_length_vector, as the given Edge does not belong to the Tree."
215  );
216  }
217 
218  auto max_val = std::numeric_limits<size_t>::max();
219  std::vector<size_t> vec( tree.edge_count(), max_val );
220 
221  // We just need two rows of the distance matrix - let's take the vectors instead for speed.
222  auto p_node_dist = node_path_length_vector(tree, edge.primary_node());
223  auto s_node_dist = node_path_length_vector(tree, edge.secondary_node());
224 
225  for( auto const& col_edge : tree.edges() ) {
226 
227  if( edge.index() == col_edge->index() ) {
228  vec[ edge.index() ] = 0;
229  continue;
230  }
231 
232  // primary-primary case
233  double pp = p_node_dist[ col_edge->primary_node().index() ];
234 
235  // primary-secondary case
236  double ps = p_node_dist[ col_edge->secondary_node().index() ];
237 
238  // secondary-primary case
239  double sp = s_node_dist[ col_edge->primary_node().index() ];
240 
241  // Find min. Make sure that the fourth case "secondary-secondary" is not shorter
242  // (if this ever happens, the tree is broken).
243  double dist = std::min(pp, std::min(ps, sp));
244  assert(dist <= s_node_dist[ col_edge->secondary_node().index() ]);
245 
246  // Store in vector.
247  vec[ col_edge->index() ] = dist + 1;
248  }
249 
250  return vec;
251 }
252 
253 // =================================================================================================
254 // Complex Distance Methods
255 // =================================================================================================
256 
273 std::vector< std::pair< TreeNode const*, size_t >> closest_leaf_depth_vector (
274  const Tree& tree
275 ) {
276  // prepare a result vector with the size of number of nodes.
277  std::vector< std::pair< TreeNode const*, size_t >> vec;
278  vec.resize(tree.node_count(), {nullptr, 0});
279 
280  // fill the vector for every node.
281  // this could be speed up by doing a postorder traversal followed by some sort of inside-out
282  // traversal (preorder might do the job). but for now, this simple O(n^2) version works, too.
283  for (auto node_it = tree.begin_nodes(); node_it != tree.end_nodes(); ++node_it) {
284  auto node = node_it->get();
285 
286  // we have not visited this node. assertion holds as long as the indices are correct.
287  assert(vec[node->index()].first == nullptr);
288 
289  // look for closest leaf node by doing a levelorder traversal.
290  for( auto it : levelorder( *node ) ) {
291  // if we find a leaf, leave the loop.
292  if (it.node().is_leaf()) {
293  vec[node->index()].first = &it.node();
294  vec[node->index()].second = it.depth();
295  break;
296  }
297  }
298  }
299 
300  return vec;
301 }
302 
303 } // namespace tree
304 } // namespace genesis
std::vector< size_t > node_path_length_vector(Tree const &tree, TreeNode const &node)
Return a vector containing the depth of all nodes with respect to the given start node...
std::vector< size_t > edge_path_length_vector(Tree const &tree, TreeEdge const &edge)
utils::Matrix< size_t > node_path_length_matrix(Tree const &tree)
Return a matrix containing the pairwise depth of all nodes of the tree.
size_t index() const
Return the index of this Edge.
Definition: edge.cpp:45
Tree operator functions.
utils::Range< IteratorNodes > nodes()
Definition: tree/tree.cpp:484
size_t index() const
Return the index of this Node.
Definition: node.cpp:48
utils::Range< IteratorLevelorder< TreeLink const, TreeNode const, TreeEdge const > > levelorder(ElementType const &element)
utils::Matrix< size_t > edge_path_length_matrix(Tree const &tree)
IteratorNodes end_nodes()
Definition: tree/tree.cpp:469
TreeNode & root_node()
Return the TreeNode at the current root of the Tree.
Definition: tree/tree.cpp:258
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.cpp:350
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
bool belongs_to(Tree const &tree, TreeNode const &node)
Return whether the TreeNode belongs to the Tree, i.e., whether it is owned by the Tree...
TreeNode & secondary_node()
Return the TreeNode of this TreeEdge that points away from the root.
Definition: edge.cpp:101
IteratorNodes begin_nodes()
Definition: tree/tree.cpp:464
Provides easy and fast logging functionality.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
std::vector< std::pair< TreeNode const *, size_t > > closest_leaf_depth_vector(const Tree &tree)
Returns a vector containing the closest leaf node for each node, measured in number of edges between ...
TreeNode & primary_node()
Return the TreeNode of this TreeEdge that points towards the root.
Definition: edge.cpp:85
Header of Tree class.
Header of Tree distance methods.