A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tree/function/distances.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 #include "genesis/tree/tree.hpp"
37 
39 
40 #include <algorithm>
41 #include <cassert>
42 #include <limits>
43 #include <stdexcept>
44 
45 #ifdef GENESIS_OPENMP
46 # include <omp.h>
47 #endif
48 
49 namespace genesis {
50 namespace tree {
51 
52 // =================================================================================================
53 // Node Distance Measures
54 // =================================================================================================
55 
57  Tree const& tree
58 ) {
59  auto max_val = std::numeric_limits<size_t>::max();
60  utils::Matrix<size_t> mat( tree.node_count(), tree.node_count(), max_val );
61 
62  // Fill every row of the matrix.
63  #pragma omp parallel for
64  for( size_t ni = 0; ni < tree.node_count(); ++ni ) {
65  auto const& row_node = tree.node_at( ni );
66  assert( row_node.index() == ni );
67 
68  // Set the diagonal element of the matrix.
69  mat( row_node.index(), row_node.index() ) = 0;
70 
71  // The columns are filled using a levelorder traversal. This makes sure that for every node
72  // we know how to calculate the distance to the current row node.
73  // Unfortunately, this prevents us from simply calculating the upper triangle of the matrix
74  // and copying it (distance is symmetric), because we do not really know which nodes are in
75  // which half during a levelorder traversal...
76  for( auto it : levelorder( row_node.link() )) {
77  // Skip the diagonal of the matrix.
78  if (it.is_first_iteration()) {
79  assert( it.node().index() == row_node.index() );
80  continue;
81  }
82 
83  // Make sure we don't have touched the current position, but have calculated
84  // the needed dependency already.
85  assert( mat(row_node.index(), it.node().index()) == max_val );
86  assert( mat(row_node.index(), it.link().outer().node().index()) != max_val );
87 
88  // The distance to the current row node is one more than the distance from the other
89  // end of that branch to the row node.
90  mat( row_node.index(), it.node().index() )
91  = 1 + mat(row_node.index(), it.link().outer().node().index());
92  }
93  }
94 
95  return mat;
96 }
97 
98 std::vector<size_t> node_path_length_vector(
99  Tree const& tree,
100  TreeNode const& node
101 ) {
102  if( ! belongs_to( tree, node )) {
103  throw std::runtime_error(
104  "Cannot caluclate node_path_length_vector, as the given Node does not belong to the Tree."
105  );
106  }
107  auto max_val = std::numeric_limits<size_t>::max();
108 
109  // store the distance from each node to the given node.
110  std::vector<size_t> vec;
111  vec.resize(tree.node_count(), max_val);
112  vec[ node.index() ] = 0;
113 
114  // calculate the distance vector via levelorder iteration.
115  for( auto it : levelorder( node ) ) {
116  // skip the starting node (it is already set to 0).
117  if (it.is_first_iteration()) {
118  continue;
119  }
120 
121  // we do not have the distance of the current node, but the one of its "parent" (the one in
122  // direction of the starting node)!
123  assert( vec[it.node().index()] == max_val );
124  assert( vec[it.link().outer().node().index()] != max_val );
125 
126  // the distance is the distance from the "parent" node (the next one in direction towards
127  // the given node) plus 1.
128  vec[it.node().index()] = 1 + vec[it.link().outer().node().index()];
129  }
130 
131  return vec;
132 }
133 
134 std::vector<size_t> node_path_length_vector(
135  Tree const& tree
136 ) {
137  return node_path_length_vector( tree, tree.root_node() );
138 }
139 
140 // =================================================================================================
141 // Edge Distance Measures
142 // =================================================================================================
143 
145  Tree const& tree
146 ) {
147  // Result matrix that will be returned.
148  utils::Matrix<size_t> mat (tree.edge_count(), tree.edge_count());
149 
150  // For calculating the distance between edges, we use the distances between nodes and for every
151  // pair of edged find the nodes at the ends of the edges that are closest to each other. This
152  // is then the shortest distance between the two edges.
153  auto node_depth_mat = node_path_length_matrix(tree);
154 
155  #pragma omp parallel for
156  for( size_t ei = 0; ei < tree.edge_count(); ++ei ) {
157  auto const& row_edge = tree.edge_at( ei );
158  assert( row_edge.index() == ei );
159 
160  for( auto const& col_edge : tree.edges() ) {
161 
162  // Set the diagonal element of the matrix. We don't need to compare nodes in this case.
163  if (row_edge.index() == col_edge->index()) {
164  mat(row_edge.index(), row_edge.index()) = 0;
165  continue;
166  }
167 
168  // primary-primary case
169  auto pp = node_depth_mat(
170  row_edge.primary_node().index(),
171  col_edge->primary_node().index()
172  );
173 
174  // primary-secondary case
175  auto ps = node_depth_mat(
176  row_edge.primary_node().index(),
177  col_edge->secondary_node().index()
178  );
179 
180  // secondary-primary case
181  auto sp = node_depth_mat(
182  row_edge.secondary_node().index(),
183  col_edge->primary_node().index()
184  );
185 
186  // Find min. Make sure that the fourth case "secondary-secondary" is not shorter
187  // (if this ever happens, the tree is broken).
188  auto dist = std::min({ pp, ps, sp });
189  assert( dist <= node_depth_mat(
190  row_edge.secondary_node().index(),
191  col_edge->secondary_node().index()
192  ));
193 
194  // Store in matrix.
195  mat( row_edge.index(), col_edge->index() ) = dist + 1;
196  }
197  }
198 
199  return mat;
200 }
201 
202 std::vector<size_t> edge_path_length_vector(
203  Tree const& tree,
204  TreeEdge const& edge
205 ) {
206  if( ! belongs_to( tree, edge )) {
207  throw std::runtime_error(
208  "Cannot caluclate node_path_length_vector, as the given Edge does not belong to the Tree."
209  );
210  }
211 
212  auto max_val = std::numeric_limits<size_t>::max();
213  std::vector<size_t> vec( tree.edge_count(), max_val );
214 
215  // We just need two rows of the distance matrix - let's take the vectors instead for speed.
216  auto p_node_dist = node_path_length_vector(tree, edge.primary_node());
217  auto s_node_dist = node_path_length_vector(tree, edge.secondary_node());
218 
219  for( auto const& col_edge : tree.edges() ) {
220 
221  if( edge.index() == col_edge->index() ) {
222  vec[ edge.index() ] = 0;
223  continue;
224  }
225 
226  // primary-primary case
227  double pp = p_node_dist[ col_edge->primary_node().index() ];
228 
229  // primary-secondary case
230  double ps = p_node_dist[ col_edge->secondary_node().index() ];
231 
232  // secondary-primary case
233  double sp = s_node_dist[ col_edge->primary_node().index() ];
234 
235  // Find min. Make sure that the fourth case "secondary-secondary" is not shorter
236  // (if this ever happens, the tree is broken).
237  double dist = std::min({ pp, ps, sp });
238  assert(dist <= s_node_dist[ col_edge->secondary_node().index() ]);
239 
240  // Store in vector.
241  vec[ col_edge->index() ] = dist + 1;
242  }
243 
244  return vec;
245 }
246 
247 // =================================================================================================
248 // Complex Distance Methods
249 // =================================================================================================
250 
251 std::vector< std::pair< TreeNode const*, size_t >> closest_leaf_depth_vector (
252  const Tree& tree
253 ) {
254  // prepare a result vector with the size of number of nodes.
255  std::vector< std::pair< TreeNode const*, size_t >> vec;
256  vec.resize(tree.node_count(), {nullptr, 0});
257 
258  // fill the vector for every node.
259  // this could be speed up by doing a postorder traversal followed by some sort of inside-out
260  // traversal (preorder might do the job). but for now, this simple O(n^2) version works, too.
261  for( auto node_it = tree.begin_nodes(); node_it != tree.end_nodes(); ++node_it ) {
262  auto node = node_it->get();
263 
264  // we have not visited this node. assertion holds as long as the indices are correct.
265  assert(vec[node->index()].first == nullptr);
266 
267  // look for closest leaf node by doing a levelorder traversal.
268  for( auto it : levelorder( *node ) ) {
269  // if we find a leaf, leave the loop.
270  if( is_leaf( it.node() )) {
271  vec[node->index()].first = &it.node();
272  vec[node->index()].second = it.depth();
273  break;
274  }
275  }
276  }
277 
278  return vec;
279 }
280 
281 } // namespace tree
282 } // namespace genesis
std::vector< size_t > node_path_length_vector(Tree const &tree, TreeNode const &node)
Return a vector containing the depth of all nodes with respect to the given start node...
std::vector< size_t > edge_path_length_vector(Tree const &tree, TreeEdge const &edge)
utils::Matrix< size_t > node_path_length_matrix(Tree const &tree)
Return a matrix containing the pairwise depth of all nodes of the tree.
size_t index() const
Return the index of this Edge.
Definition: edge.hpp:105
TreeNode & node_at(size_t index)
Return the TreeNode at a certain index.
Definition: tree/tree.cpp:304
Tree operator functions.
TreeNode & secondary_node()
Return the TreeNode of this TreeEdge that points away from the root.
Definition: edge.hpp:161
size_t index() const
Return the index of this Node.
Definition: node.hpp:100
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
utils::Range< IteratorLevelorder< TreeLink const, TreeNode const, TreeEdge const > > levelorder(ElementType const &element)
std::vector< std::pair< TreeNode const *, size_t > > closest_leaf_depth_vector(const Tree &tree)
Returns a vector containing the closest leaf node for each node, measured in number of edges between ...
utils::Matrix< size_t > edge_path_length_matrix(Tree const &tree)
IteratorNodes end_nodes()
Definition: tree/tree.cpp:469
TreeNode & root_node()
Return the TreeNode at the current root of the Tree.
Definition: tree/tree.cpp:258
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.cpp:350
TreeNode & primary_node()
Return the TreeNode of this TreeEdge that points towards the root.
Definition: edge.hpp:145
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
bool belongs_to(Tree const &tree, TreeNode const &node)
Return whether the TreeNode belongs to the Tree, i.e., whether it is owned by the Tree...
IteratorNodes begin_nodes()
Definition: tree/tree.cpp:464
Provides easy and fast logging functionality.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.cpp:324
Header of Tree class.
Header of Tree distance methods.