A library for working with phylogenetic and population genetic data.
v0.27.0
rf.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 
38 #include <cstdint>
39 #include <algorithm>
40 #include <cassert>
41 #include <stdexcept>
42 #include <limits>
43 
44 #ifdef GENESIS_OPENMP
45 # include <omp.h>
46 #endif
47 
48 namespace genesis {
49 namespace tree {
50 
51 // =================================================================================================
52 // Taxon Name Handling
53 // =================================================================================================
54 
55 std::unordered_map<std::string, size_t> rf_taxon_name_map( Tree const& tree )
56 {
57  std::unordered_map<std::string, size_t> names;
58 
59  // Collect all leaf node names and give them unique indices.
60  // Make sure that inner nodes do not have names, as this might confuse all downstream steps.
61  for( auto const& node : tree.nodes() ) {
62  auto const& name = node.data<CommonNodeData>().name;
63 
64  // Only consider leaf nodes.
65  if( is_leaf( node )) {
66  if( name.empty() ) {
67  throw std::runtime_error(
68  "Cannot calculate RF distnace with empty leaf node names."
69  );
70  }
71  if( names.count( name ) > 0 ) {
72  throw std::runtime_error(
73  "Cannot calculate RF distance with tree that has duplicate node names. "
74  "Name '" + name + "' appears multiple times."
75  );
76  }
77 
78  // We use the current size of the map to get unique indices.
79  auto const idx = names.size();
80  names[ name ] = idx;
81  }
82 
83  // For now, we allow inner node names. They do not matter for our algorithm,
84  // so we just ignore them. However, this might lead to confusion, because the RF distance
85  // needs taxon names to identify nodes, and having names on inner nodes just doesn't make
86  // sense in this context. Still, let's allow it and hope for sane users.
87  // else if( ! name.empty() ) {
88  // throw std::runtime_error(
89  // "Cannot calculate RF distance with tree that has inner node names. "
90  // "Found name '" + name + "' at inner node."
91  // );
92  // }
93  }
94 
95  return names;
96 }
97 
98 // =================================================================================================
99 // Finding Split Bitvectors of Trees
100 // =================================================================================================
101 
109 template<typename BitvectorProcessor>
111  Tree const& tree,
112  std::unordered_map<std::string, size_t> const& names,
113  BitvectorProcessor const& process_bitvector
114 ) {
115  using utils::Bitvector;
116 
117  // Prepare intermediate structure for each edge of the tree,
118  // which keeps track of all Bitvectors of the edges.
119  auto bipartitions = std::vector<Bitvector>( tree.edge_count() );
120 
121  // We also keep track of names: each one needs to appear exactly once!
122  auto name_check = Bitvector( names.size() );
123 
124  // If the tree is rooted, we need to skip one of the two edges next to the root.
125  // They induce the same split, and we do not want to count it twice.
126  // In order to skip one of the two root edges in rooted trees, we store one of their
127  // indices, and skip it later. It does not matter which index we store.
128  // In unrooted trees, we instead store a dummy that never gets skipped.
129  // This could theoretically also happen at inner nodes, if they just have two neighbors,
130  // which is possible, but rare. We ignore it here.
131  size_t root_skip = std::numeric_limits<size_t>::max();
132  if( is_rooted( tree )) {
133  assert( degree( tree.root_node()) == 2 );
134  root_skip = tree.root_node().primary_edge().secondary_node().index();
135  }
136 
137  // Fill bitvectors.
138  for( auto it : postorder(tree) ) {
139  // We want to iterate edges, so skip the last iteration at the root node,
140  // as then we already have processed all edges.
141  if( it.is_last_iteration() ) {
142  continue;
143  }
144 
145  // Also skip one of the root edges if the tree is rooted.
146  if( it.node().index() == root_skip ) {
147  continue;
148  }
149 
150  // If the iterator is at a leaf, just set one bit in the bitvector.
151  if( is_leaf( it.node() ) ) {
152 
153  // Get the index of the name of the leaf.
154  auto const& name = it.node().data<CommonNodeData>().name;
155  auto const nit = names.find( name );
156  if( nit == names.end() ) {
157  throw std::runtime_error(
158  "Cannot calculate RF distance with inconsistent node names. "
159  "Name '" + name + "' is missing from a tree."
160  );
161  }
162 
163  // Check that the name did not appear yet in the tree.
164  if( name_check[ nit->second ] ) {
165  throw std::runtime_error(
166  "Cannot calculate RF distance with tree that has duplicate node names. "
167  "Name '"+ name + "' appears multiple times."
168  );
169  }
170  name_check.set( nit->second );
171 
172  // Store the result in the intermediate structure.
173  // We use a bitvector with just one bit set at the index of the current leaf.
174  // This uniquely identifies this trivial split.
175  auto const eidx = it.edge().index();
176  bipartitions[ eidx ] = Bitvector( names.size() );
177  bipartitions[ eidx ].set( nit->second );
178 
179  // For inner iterator positions, consider the whole subtree below it.
180  } else {
181  auto current = Bitvector( names.size() );
182 
183  // Here, we could test for inner node names.
184  // But as above, we ignore and hence allow them.
185 
186  // We do postorder traversal, so all subtrees of the current node have been processed.
187  // They store trivial splits as single unique bits in their respective bitvectors.
188  // So here, we simply combine them (using or), to get a bitvector of all tips of the
189  // current split. This is not normalized yet, meaning that these bits could stand
190  // for both ways of denoting that split. We later do the needed normalization.
191  auto l = &it.link().next();
192  while( l != &it.link() ) {
193  current |= bipartitions[ l->edge().index() ];
194  l = &l->next();
195  }
196 
197  // Store at the current edge in the intermediate structure.
198  // This needs to be the not yet normalized one, because we are still filling up further
199  // inner nodes, and hence need to maintain these bits as they are.
200  bipartitions[ it.edge().index() ] = current;
201 
202  // Call the bitvector processor functor now, as we just finished constructing a split.
203  // We normalize first to make sure that we always get comparable bitvectors in the end.
204  current.normalize();
205  process_bitvector( current );
206  }
207  }
208 
209  // We have traversed all node names now. If there is still an unset bit in the bitvector,
210  // that means that we did not find all names that are in the tree.
211  if( name_check.count() != names.size() ) {
212  throw std::runtime_error(
213  "Cannot calculate RF distance with trees that have different node names. "
214  "Some names are missing from one of the trees."
215  );
216  }
217 }
218 
219 std::vector<utils::Bitvector> rf_get_bitvectors(
220  Tree const& tree,
221  std::unordered_map<std::string, size_t> const& names
222 ) {
223  using utils::Bitvector;
224 
225  // Prepare result.
226  std::vector<Bitvector> result;
227  result.reserve( inner_edge_count( tree ));
228 
229  // Get all bitvectors and store them in the result.
230  rf_get_bitvectors_template( tree, names, [&]( Bitvector const& bitvec ){
231  result.push_back( bitvec );
232  });
233 
234  return result;
235 }
236 
237 // =================================================================================================
238 // Getting Occurrences of Splits in Trees
239 // =================================================================================================
240 
241 std::unordered_map<utils::Bitvector, utils::Bitvector> rf_get_occurrences( TreeSet const& trees )
242 {
243  using utils::Bitvector;
244 
245  // Map from bitvectors of splits to bitvectors of occurences:
246  // which bitvector (keys of the map) occurs in which tree (values associated with each key).
247  auto result = std::unordered_map<utils::Bitvector, utils::Bitvector>();
248 
249  // Edge case.
250  if( trees.empty() ) {
251  return result;
252  }
253 
254  // Get a unique ID for each taxon name.
255  auto const names = rf_taxon_name_map( trees[0] );
256 
257  #pragma omp parallel for
258  for( size_t i = 0; i < trees.size(); ++i ) {
259 
260  // Get all bitvectors of the tree and add their occurrences to the map.
261  // This way, we do not need to actually store them, but can process them on the fly.
262  // Saves mem and should be faster as well.
263  rf_get_bitvectors_template( trees[i], names, [&]( Bitvector const& bitvec ){
264  #pragma omp critical(GENESIS_BIPARTITION_OCCURRENCES)
265  {
266  if( result.count( bitvec ) == 0 ) {
267  result[bitvec] = Bitvector( trees.size() );
268  }
269  result[bitvec].set( i );
270  }
271  });
272  }
273 
274  return result;
275 }
276 
277 std::unordered_map<utils::Bitvector, utils::Bitvector> rf_get_occurrences(
278  Tree const& lhs,
279  TreeSet const& rhs
280 ) {
281  using utils::Bitvector;
282 
283  // Map from bitvectors of splits to bitvectors of occurences:
284  // which bitvector (keys of the map) occurs in which tree (values assiciated with each key).
285  auto result = std::unordered_map<utils::Bitvector, utils::Bitvector>();
286 
287  // Edge case.
288  if( rhs.empty() ) {
289  return result;
290  }
291 
292  // Get a unique ID for each taxon name.
293  auto const names = rf_taxon_name_map( lhs );
294 
295  // Get the split bitvectors for the lhs tree. We initialize with enough room
296  // for lhs and rhs trees.
297  rf_get_bitvectors_template( lhs, names, [&]( Bitvector const& bitvec ){
298  assert( result.count( bitvec ) == 0 );
299  result[bitvec] = Bitvector( 1 + rhs.size() );
300  result[bitvec].set( 0 );
301  });
302 
303  // Process the rhs trees, and add their split bitvectors.
304  #pragma omp parallel for
305  for( size_t i = 0; i < rhs.size(); ++i ) {
306 
307  rf_get_bitvectors_template( rhs[i], names, [&]( Bitvector const& bitvec ){
308  #pragma omp critical(GENESIS_BIPARTITION_OCCURRENCES)
309  {
310  // We start indexing 1 off, for the lhs tree.
311  if( result.count( bitvec ) == 0 ) {
312  result[bitvec] = utils::Bitvector( 1 + rhs.size() );
313  }
314  result[bitvec].set( 1 + i );
315  }
316  });
317  }
318 
319  return result;
320 }
321 
322 std::unordered_map<utils::Bitvector, utils::Bitvector> rf_get_occurrences(
323  Tree const& lhs,
324  Tree const& rhs
325 ) {
326  using utils::Bitvector;
327 
328  // Map from bitvectors of splits to bitvectors of occurences:
329  // which bitvector (keys of the map) occurs in which tree (values assiciated with each key).
330  auto result = std::unordered_map<utils::Bitvector, utils::Bitvector>();
331 
332  // Get a unique ID for each taxon name.
333  auto const names = rf_taxon_name_map( lhs );
334 
335  // Get the split bitvectors for the lhs tree. We initialize with enough room
336  // for lhs and rhs trees.
337  rf_get_bitvectors_template( lhs, names, [&]( Bitvector const& bitvec ){
338  assert( result.count( bitvec ) == 0 );
339  result[bitvec] = Bitvector( 2 );
340  result[bitvec].set( 0 );
341  });
342 
343  // Do the same for the rhs tree. This time we need to make sure not to overwrite any
344  // existing splits in the map.
345  rf_get_bitvectors_template( rhs, names, [&]( Bitvector const& bitvec ){
346  if( result.count( bitvec ) == 0 ) {
347  result[bitvec] = utils::Bitvector( 2 );
348  }
349  result[bitvec].set( 1 );
350  });
351 
352  return result;
353 }
354 
355 // =================================================================================================
356 // Absolute RF Distance Functions
357 // =================================================================================================
358 
360 {
361  using utils::Matrix;
362 
363  auto result = Matrix<size_t>( trees.size(), trees.size(), 0 );
364  auto const hash_occs = rf_get_occurrences( trees );
365 
366  // We test every split that occurred in the trees.
367  for( auto const& hash_occ : hash_occs ) {
368 
369  // Go through all trees and see if it appeared in them.
370  for( size_t i = 0; i < trees.size(); ++i ) {
371  if( hash_occ.second[i] == false ) {
372  continue;
373  }
374 
375  // If we are here, we have a split that occured in tree i.
376  // Now we check, if it also appeared in tree j (for all j!=i).
377  // If not, we have a split that is part of i but not of j,
378  // so it adds to their pairwise distance.
379  for( size_t j = 0; j < trees.size(); ++j ) {
380  if( i == j ) {
381  continue;
382  }
383  if( hash_occ.second[j] == false ) {
384  ++result( i, j );
385  ++result( j, i );
386  }
387  }
388  }
389  }
390 
391  return result;
392 }
393 
394 std::vector<size_t> rf_distance_absolute( Tree const& lhs, TreeSet const& rhs )
395 {
396  auto result = std::vector<size_t>( rhs.size(), 0 );
397  auto const hash_occs = rf_get_occurrences( lhs, rhs );
398 
399  // We test every split that occurred in all of the trees.
400  for( auto const& hash_occ : hash_occs ) {
401  // See if it was in the lhs tree.
402  bool const in_lhs = hash_occ.second[0];
403 
404  // Now go through all rhs trees and see if it also appeared there.
405  for( size_t i = 0; i < rhs.size(); ++i ) {
406  bool const in_rhs = hash_occ.second[ 1 + i ];
407 
408  // Now, in_lhs and in_rhs indicate in which of the trees the split appeared.
409  // It adds to the distance between lhs and rhs[i] only if those two differ,
410  // which is the case if in_lhs ^ in_rhs == true.
411 
412  // Bool converts to 1, so we can use this to process without branching.
413  result[i] += (in_lhs ^ in_rhs);
414 
415  // Just in case, this is equivalent, but needs branching:
416  // if( in_lhs ^ in_rhs ) {
417  // ++result[i];
418  // }
419  }
420  }
421 
422  return result;
423 }
424 
425 size_t rf_distance_absolute( Tree const& lhs, Tree const& rhs )
426 {
427  size_t result = 0;
428 
429  // Get a map of all splits that appear in the two trees to a bitvector of size two
430  // indicating in which of the trees the split appeared.
431  auto const hash_occs = rf_get_occurrences( lhs, rhs );
432 
433  // We test every split that occurred in all of the trees.
434  for( auto const& hash_occ : hash_occs ) {
435  assert( hash_occ.second.size() == 2 );
436 
437  // See if it was in the lhs tree, and rhs tree, respectively.
438  bool const in_lhs = hash_occ.second[ 0 ];
439  bool const in_rhs = hash_occ.second[ 1 ];
440 
441  // At least one of them needs to be set, otherwise the split should not have ended
442  // up in the split list in the first place.
443  assert( in_lhs || in_rhs );
444 
445  // Now, in_lhs and in_rhs indicate in which of the trees the split appeared.
446  // It adds to the distance between lhs and rhs[i] only if those two differ,
447  // which is the case if in_lhs ^ in_rhs == true.
448 
449  // Bool converts to 1, so we can use this to process without branching.
450  result += (in_lhs ^ in_rhs);
451  }
452 
453  return result;
454 }
455 
456 // =================================================================================================
457 // Relative RF Distance Functions
458 // =================================================================================================
459 
461 {
462  // Prepare result.
463  auto result = utils::Matrix<double>( trees.size(), trees.size() );
464  if( trees.size() == 0 ) {
465  return result;
466  }
467 
468  // Compute abs rf dist.
469  auto const rf = rf_distance_absolute( trees );
470  assert( rf.rows() == trees.size() );
471  assert( rf.cols() == trees.size() );
472 
473  // Get norm factor.
474  assert( trees.size() > 0 );
475  if( trees[0].node_count() < 3 ) {
476  throw std::runtime_error(
477  "Cannot compute relative RF distance for trees with less than 3 taxa."
478  );
479  }
480  double const norm = 2.0 * static_cast<double>( trees[0].node_count() - 3 );
481 
482  // Compute matrix.
483  #pragma omp parallel for
484  for( size_t i = 0; i < rf.rows(); ++i ) {
485  for( size_t j = 0; j < rf.cols(); ++j ) {
486  result( i, j ) = static_cast<double>( rf( i, j )) / norm;
487  }
488  }
489 
490  return result;
491 }
492 
493 std::vector<double> rf_distance_relative( Tree const& lhs, TreeSet const& rhs )
494 {
495  // Prepare result.
496  auto result = std::vector<double>( rhs.size() );
497 
498  // Compute abs rf dist.
499  auto const rf = rf_distance_absolute( lhs, rhs );
500  assert( rf.size() == rhs.size() );
501 
502  // Get norm factor.
503  if( lhs.node_count() < 3 ) {
504  throw std::runtime_error(
505  "Cannot compute relative RF distance for trees with less than 3 taxa."
506  );
507  }
508  double const norm = 2.0 * static_cast<double>( lhs.node_count() - 3 );
509 
510  // Compute vector.
511  #pragma omp parallel for
512  for( size_t i = 0; i < rf.size(); ++i ) {
513  result[i] = static_cast<double>( rf[i] ) / norm;
514  }
515 
516  return result;
517 }
518 
519 double rf_distance_relative( Tree const& lhs, Tree const& rhs )
520 {
521  // Compute abs rf dist.
522  auto const rf = rf_distance_absolute( lhs, rhs );
523 
524  // Get norm factor.
525  if( lhs.node_count() < 3 ) {
526  throw std::runtime_error(
527  "Cannot compute relative RF distance for trees with less than 3 taxa."
528  );
529  }
530  double const norm = 2.0 * static_cast<double>( lhs.node_count() - 3 );
531 
532  return static_cast<double>( rf ) / norm;
533 }
534 
535 } // namespace tree
536 } // namespace genesis
algorithm.hpp
Provides some valuable algorithms that are not part of the C++ 11 STL.
genesis::tree::Tree::node_count
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.hpp:264
genesis::tree::rf_get_bitvectors_template
void rf_get_bitvectors_template(Tree const &tree, std::unordered_map< std::string, size_t > const &names, BitvectorProcessor const &process_bitvector)
Local helper function template that constructs all bitvectors for the splits of a tree,...
Definition: rf.cpp:110
genesis::tree::rf_get_bitvectors
std::vector< utils::Bitvector > rf_get_bitvectors(Tree const &tree, std::unordered_map< std::string, size_t > const &names)
Get all split Bitvectors for a given Tree.
Definition: rf.cpp:219
genesis::tree::rf_get_occurrences
std::unordered_map< utils::Bitvector, utils::Bitvector > rf_get_occurrences(TreeSet const &trees)
Get an occurrence map for each split found in the given TreeSet.
Definition: rf.cpp:241
genesis::tree::TreeSet::size
size_t size() const
Return the size of the TreeSet, i.e., the number of stored Trees.
Definition: tree_set.hpp:228
functions.hpp
genesis::tree::degree
size_t degree(TreeLink const &link)
Return the degree of the node for a given TreeLink, i.e. how many neighbouring nodes it has.
Definition: tree/function/functions.cpp:103
genesis::tree::Tree::root_node
TreeNode & root_node()
Return the TreeNode at the current root of the Tree.
Definition: tree/tree.hpp:300
genesis::tree::TreeEdge::secondary_node
TreeNode & secondary_node()
Return the TreeNode of this TreeEdge that points away from the root.
Definition: edge.hpp:162
genesis::utils::Matrix< size_t >
genesis::tree::TreeSet
Definition: tree_set.hpp:48
genesis::tree::rf_distance_absolute
utils::Matrix< size_t > rf_distance_absolute(TreeSet const &trees)
Compute the pairwise absolute RF (Robinson-Foulds) distance metric between a set of trees.
Definition: rf.cpp:359
genesis::tree::Tree
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
genesis::tree::node_count
size_t node_count(Tree const &tree)
Return the number of Nodes of a Tree. Same as Tree::node_count().
Definition: tree/function/functions.cpp:185
genesis::tree::rf_taxon_name_map
std::unordered_map< std::string, size_t > rf_taxon_name_map(Tree const &tree)
Get a mapping from taxon names to unique IDs.
Definition: rf.cpp:55
genesis::tree::rf_distance_relative
utils::Matrix< double > rf_distance_relative(TreeSet const &trees)
Compute the pairwise relative RF (Robinson-Foulds) distance metric between a set of trees.
Definition: rf.cpp:460
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::tree::inner_edge_count
size_t inner_edge_count(Tree const &tree)
Return the number of Edges of a Tree that do not lead to a leaf Node.
Definition: tree/function/functions.cpp:201
rf.hpp
genesis::tree::Tree::nodes
utils::Range< IteratorNodes > nodes()
Definition: tree/tree.hpp:418
tree.hpp
genesis::tree::is_rooted
bool is_rooted(Tree const &tree)
Return whether the Tree is rooted, that is, whether the root node has two neighbors.
Definition: tree/function/functions.cpp:163
genesis::tree::TreeNode::primary_edge
TreeEdge & primary_edge()
Return the TreeEdge that points towards the root.
Definition: tree/tree/node.hpp:148
genesis::tree::CommonNodeData
Common class containing the commonly needed data for tree nodes.
Definition: tree/common_tree/tree.hpp:79
genesis::tree::TreeNode::index
size_t index() const
Return the index of this Node.
Definition: tree/tree/node.hpp:102
postorder.hpp
genesis::tree::is_leaf
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
Definition: tree/function/functions.cpp:59
genesis::tree::TreeSet::empty
bool empty() const
Return whether the TreeSet is empty.
Definition: tree_set.hpp:219
genesis::tree::postorder
utils::Range< IteratorPostorder< true > > postorder(ElementType const &element)
Definition: tree/iterator/postorder.hpp:320
genesis::tree::Tree::edge_count
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
genesis::utils::Bitvector
Definition: bitvector.hpp:48