A toolkit for working with phylogenetic data.
v0.24.0
rf.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 
38 #include <cstdint>
39 #include <algorithm>
40 #include <cassert>
41 #include <stdexcept>
42 
43 #ifdef GENESIS_OPENMP
44 # include <omp.h>
45 #endif
46 
47 namespace genesis {
48 namespace tree {
49 
50 // =================================================================================================
51 // Taxon Name Handling
52 // =================================================================================================
53 
54 std::unordered_map<std::string, size_t> rf_taxon_name_map( Tree const& tree )
55 {
56  std::unordered_map<std::string, size_t> names;
57 
58  // Collect all leaf node names and give them unique indices.
59  // Make sure that inner nodes do not have names, as this might confuse all downstream steps.
60  for( auto const& node : tree.nodes() ) {
61  auto const& name = node.data<CommonNodeData>().name;
62 
63  // Only consider leaf nodes.
64  if( is_leaf( node )) {
65  if( name.empty() ) {
66  throw std::runtime_error(
67  "Cannot calculate RF distnace with empty leaf node names."
68  );
69  }
70  if( names.count( name ) > 0 ) {
71  throw std::runtime_error(
72  "Cannot calculate RF distance with tree that has duplicate node names. "
73  "Name '" + name + "' appears multiple times."
74  );
75  }
76 
77  // We use the current size of the map to get unique indices.
78  auto const idx = names.size();
79  names[ name ] = idx;
80  }
81 
82  // For now, we allow inner node names. They do not matter for our algorithm,
83  // so we just ignore them. However, this might lead to confusion, because the RF distance
84  // needs taxon names to identify nodes, and having names on inner nodes just doesn't make
85  // sense in this context. Still, let's allow it and hope for sane users.
86  // else if( ! name.empty() ) {
87  // throw std::runtime_error(
88  // "Cannot calculate RF distance with tree that has inner node names. "
89  // "Found name '" + name + "' at inner node."
90  // );
91  // }
92  }
93 
94  return names;
95 }
96 
97 // =================================================================================================
98 // Finding Split Bitvectors of Trees
99 // =================================================================================================
100 
108 template<typename BitvectorProcessor>
110  Tree const& tree,
111  std::unordered_map<std::string, size_t> const& names,
112  BitvectorProcessor const& process_bitvector
113 ) {
114  using utils::Bitvector;
115 
116  // Prepare intermediate structure for each edge of the tree,
117  // which keeps track of all Bitvectors of the edges.
118  auto bipartitions = std::vector<Bitvector>( tree.edge_count() );
119 
120  // We also keep track of names: each one needs to appear exactly once!
121  auto name_check = Bitvector( names.size() );
122 
123  // If the tree is rooted, we need to skip one of the two edges next to the root.
124  // They induce the same split, and we do not want to count it twice.
125  // In order to skip one of the two root edges in rooted trees, we store one of their
126  // indices, and skip it later. It does not matter which index we store.
127  // In unrooted trees, we instead store a dummy that never gets skipped.
128  // This could theoretically also happen at inner nodes, if they just have two neighbors,
129  // which is possible, but rare. We ignore it here.
130  size_t root_skip = std::numeric_limits<size_t>::max();
131  if( is_rooted( tree )) {
132  assert( degree( tree.root_node()) == 2 );
133  root_skip = tree.root_node().primary_edge().secondary_node().index();
134  }
135 
136  // Fill bitvectors.
137  for( auto it : postorder(tree) ) {
138  // We want to iterate edges, so skip the last iteration at the root node,
139  // as then we already have processed all edges.
140  if( it.is_last_iteration() ) {
141  continue;
142  }
143 
144  // Also skip one of the root edges if the tree is rooted.
145  if( it.node().index() == root_skip ) {
146  continue;
147  }
148 
149  // If the iterator is at a leaf, just set one bit in the bitvector.
150  if( is_leaf( it.node() ) ) {
151 
152  // Get the index of the name of the leaf.
153  auto const& name = it.node().data<CommonNodeData>().name;
154  auto const nit = names.find( name );
155  if( nit == names.end() ) {
156  throw std::runtime_error(
157  "Cannot calculate RF distance with inconsistent node names. "
158  "Name '" + name + "' is missing from a tree."
159  );
160  }
161 
162  // Check that the name did not appear yet in the tree.
163  if( name_check[ nit->second ] ) {
164  throw std::runtime_error(
165  "Cannot calculate RF distance with tree that has duplicate node names. "
166  "Name '"+ name + "' appears multiple times."
167  );
168  }
169  name_check.set( nit->second );
170 
171  // Store the result in the intermediate structure.
172  // We use a bitvector with just one bit set at the index of the current leaf.
173  // This uniquely identifies this trivial split.
174  auto const eidx = it.edge().index();
175  bipartitions[ eidx ] = Bitvector( names.size() );
176  bipartitions[ eidx ].set( nit->second );
177 
178  // For inner iterator positions, consider the whole subtree below it.
179  } else {
180  auto current = Bitvector( names.size() );
181 
182  // Here, we could test for inner node names.
183  // But as above, we ignore and hence allow them.
184 
185  // We do postorder traversal, so all subtrees of the current node have been processed.
186  // They store trivial splits as single unique bits in their respective bitvectors.
187  // So here, we simply combine them (using or), to get a bitvector of all tips of the
188  // current split. This is not normalized yet, meaning that these bits could stand
189  // for both ways of denoting that split. We later do the needed normalization.
190  auto l = &it.link().next();
191  while( l != &it.link() ) {
192  current |= bipartitions[ l->edge().index() ];
193  l = &l->next();
194  }
195 
196  // Store at the current edge in the intermediate structure.
197  // This needs to be the not yet normalized one, because we are still filling up further
198  // inner nodes, and hence need to maintain these bits as they are.
199  bipartitions[ it.edge().index() ] = current;
200 
201  // Call the bitvector processor functor now, as we just finished constructing a split.
202  // We normalize first to make sure that we always get comparable bitvectors in the end.
203  current.normalize();
204  process_bitvector( current );
205  }
206  }
207 
208  // We have traversed all node names now. If there is still an unset bit in the bitvector,
209  // that means that we did not find all names that are in the tree.
210  if( name_check.count() != names.size() ) {
211  throw std::runtime_error(
212  "Cannot calculate RF distance with trees that have different node names. "
213  "Some names are missing from one of the trees."
214  );
215  }
216 }
217 
218 std::vector<utils::Bitvector> rf_get_bitvectors(
219  Tree const& tree,
220  std::unordered_map<std::string, size_t> const& names
221 ) {
222  using utils::Bitvector;
223 
224  // Prepare result.
225  std::vector<Bitvector> result;
226  result.reserve( inner_edge_count( tree ));
227 
228  // Get all bitvectors and store them in the result.
229  rf_get_bitvectors_template( tree, names, [&]( Bitvector const& bitvec ){
230  result.push_back( bitvec );
231  });
232 
233  return result;
234 }
235 
236 // =================================================================================================
237 // Getting Occurrences of Splits in Trees
238 // =================================================================================================
239 
240 std::unordered_map<utils::Bitvector, utils::Bitvector> rf_get_occurrences( TreeSet const& trees )
241 {
242  using utils::Bitvector;
243 
244  // Map from bitvectors of splits to bitvectors of occurences:
245  // which bitvector (keys of the map) occurs in which tree (values associated with each key).
246  auto result = std::unordered_map<utils::Bitvector, utils::Bitvector>();
247 
248  // Edge case.
249  if( trees.empty() ) {
250  return result;
251  }
252 
253  // Get a unique ID for each taxon name.
254  auto const names = rf_taxon_name_map( trees[0] );
255 
256  #pragma omp parallel for
257  for( size_t i = 0; i < trees.size(); ++i ) {
258 
259  // Get all bitvectors of the tree and add their occurrences to the map.
260  // This way, we do not need to actually store them, but can process them on the fly.
261  // Saves mem and should be faster as well.
262  rf_get_bitvectors_template( trees[i], names, [&]( Bitvector const& bitvec ){
263  #pragma omp critical(GENESIS_BIPARTITION_OCCURRENCES)
264  {
265  if( result.count( bitvec ) == 0 ) {
266  result[bitvec] = Bitvector( trees.size() );
267  }
268  result[bitvec].set( i );
269  }
270  });
271  }
272 
273  return result;
274 }
275 
276 std::unordered_map<utils::Bitvector, utils::Bitvector> rf_get_occurrences(
277  Tree const& lhs,
278  TreeSet const& rhs
279 ) {
280  using utils::Bitvector;
281 
282  // Map from bitvectors of splits to bitvectors of occurences:
283  // which bitvector (keys of the map) occurs in which tree (values assiciated with each key).
284  auto result = std::unordered_map<utils::Bitvector, utils::Bitvector>();
285 
286  // Edge case.
287  if( rhs.empty() ) {
288  return result;
289  }
290 
291  // Get a unique ID for each taxon name.
292  auto const names = rf_taxon_name_map( lhs );
293 
294  // Get the split bitvectors for the lhs tree. We initialize with enough room
295  // for lhs and rhs trees.
296  rf_get_bitvectors_template( lhs, names, [&]( Bitvector const& bitvec ){
297  assert( result.count( bitvec ) == 0 );
298  result[bitvec] = Bitvector( 1 + rhs.size() );
299  result[bitvec].set( 0 );
300  });
301 
302  // Process the rhs trees, and add their split bitvectors.
303  #pragma omp parallel for
304  for( size_t i = 0; i < rhs.size(); ++i ) {
305 
306  rf_get_bitvectors_template( rhs[i], names, [&]( Bitvector const& bitvec ){
307  #pragma omp critical(GENESIS_BIPARTITION_OCCURRENCES)
308  {
309  // We start indexing 1 off, for the lhs tree.
310  if( result.count( bitvec ) == 0 ) {
311  result[bitvec] = utils::Bitvector( 1 + rhs.size() );
312  }
313  result[bitvec].set( 1 + i );
314  }
315  });
316  }
317 
318  return result;
319 }
320 
321 std::unordered_map<utils::Bitvector, utils::Bitvector> rf_get_occurrences(
322  Tree const& lhs,
323  Tree const& rhs
324 ) {
325  using utils::Bitvector;
326 
327  // Map from bitvectors of splits to bitvectors of occurences:
328  // which bitvector (keys of the map) occurs in which tree (values assiciated with each key).
329  auto result = std::unordered_map<utils::Bitvector, utils::Bitvector>();
330 
331  // Get a unique ID for each taxon name.
332  auto const names = rf_taxon_name_map( lhs );
333 
334  // Get the split bitvectors for the lhs tree. We initialize with enough room
335  // for lhs and rhs trees.
336  rf_get_bitvectors_template( lhs, names, [&]( Bitvector const& bitvec ){
337  assert( result.count( bitvec ) == 0 );
338  result[bitvec] = Bitvector( 2 );
339  result[bitvec].set( 0 );
340  });
341 
342  // Do the same for the rhs tree. This time we need to make sure not to overwrite any
343  // existing splits in the map.
344  rf_get_bitvectors_template( rhs, names, [&]( Bitvector const& bitvec ){
345  if( result.count( bitvec ) == 0 ) {
346  result[bitvec] = utils::Bitvector( 2 );
347  }
348  result[bitvec].set( 1 );
349  });
350 
351  return result;
352 }
353 
354 // =================================================================================================
355 // Absolute RF Distance Functions
356 // =================================================================================================
357 
359 {
360  using utils::Matrix;
361 
362  auto result = Matrix<size_t>( trees.size(), trees.size(), 0 );
363  auto const hash_occs = rf_get_occurrences( trees );
364 
365  // We test every split that occurred in the trees.
366  for( auto const& hash_occ : hash_occs ) {
367 
368  // Go through all trees and see if it appeared in them.
369  for( size_t i = 0; i < trees.size(); ++i ) {
370  if( hash_occ.second[i] == false ) {
371  continue;
372  }
373 
374  // If we are here, we have a split that occured in tree i.
375  // Now we check, if it also appeared in tree j (for all j!=i).
376  // If not, we have a split that is part of i but not of j,
377  // so it adds to their pairwise distance.
378  for( size_t j = 0; j < trees.size(); ++j ) {
379  if( i == j ) {
380  continue;
381  }
382  if( hash_occ.second[j] == false ) {
383  ++result( i, j );
384  ++result( j, i );
385  }
386  }
387  }
388  }
389 
390  return result;
391 }
392 
393 std::vector<size_t> rf_distance_absolute( Tree const& lhs, TreeSet const& rhs )
394 {
395  auto result = std::vector<size_t>( rhs.size(), 0 );
396  auto const hash_occs = rf_get_occurrences( lhs, rhs );
397 
398  // We test every split that occurred in all of the trees.
399  for( auto const& hash_occ : hash_occs ) {
400  // See if it was in the lhs tree.
401  bool const in_lhs = hash_occ.second[0];
402 
403  // Now go through all rhs trees and see if it also appeared there.
404  for( size_t i = 0; i < rhs.size(); ++i ) {
405  bool const in_rhs = hash_occ.second[ 1 + i ];
406 
407  // Now, in_lhs and in_rhs indicate in which of the trees the split appeared.
408  // It adds to the distance between lhs and rhs[i] only if those two differ,
409  // which is the case if in_lhs ^ in_rhs == true.
410 
411  // Bool converts to 1, so we can use this to process without branching.
412  result[i] += (in_lhs ^ in_rhs);
413 
414  // Just in case, this is equivalent, but needs branching:
415  // if( in_lhs ^ in_rhs ) {
416  // ++result[i];
417  // }
418  }
419  }
420 
421  return result;
422 }
423 
424 size_t rf_distance_absolute( Tree const& lhs, Tree const& rhs )
425 {
426  size_t result = 0;
427 
428  // Get a map of all splits that appear in the two trees to a bitvector of size two
429  // indicating in which of the trees the split appeared.
430  auto const hash_occs = rf_get_occurrences( lhs, rhs );
431 
432  // We test every split that occurred in all of the trees.
433  for( auto const& hash_occ : hash_occs ) {
434  assert( hash_occ.second.size() == 2 );
435 
436  // See if it was in the lhs tree, and rhs tree, respectively.
437  bool const in_lhs = hash_occ.second[ 0 ];
438  bool const in_rhs = hash_occ.second[ 1 ];
439 
440  // At least one of them needs to be set, otherwise the split should not have ended
441  // up in the split list in the first place.
442  assert( in_lhs || in_rhs );
443 
444  // Now, in_lhs and in_rhs indicate in which of the trees the split appeared.
445  // It adds to the distance between lhs and rhs[i] only if those two differ,
446  // which is the case if in_lhs ^ in_rhs == true.
447 
448  // Bool converts to 1, so we can use this to process without branching.
449  result += (in_lhs ^ in_rhs);
450  }
451 
452  return result;
453 }
454 
455 // =================================================================================================
456 // Relative RF Distance Functions
457 // =================================================================================================
458 
460 {
461  // Prepare result.
462  auto result = utils::Matrix<double>( trees.size(), trees.size() );
463  if( trees.size() == 0 ) {
464  return result;
465  }
466 
467  // Compute abs rf dist.
468  auto const rf = rf_distance_absolute( trees );
469  assert( rf.rows() == trees.size() );
470  assert( rf.cols() == trees.size() );
471 
472  // Get norm factor.
473  assert( trees.size() > 0 );
474  if( trees[0].node_count() < 3 ) {
475  throw std::runtime_error(
476  "Cannot compute relative RF distance for trees with less than 3 taxa."
477  );
478  }
479  double const norm = 2.0 * static_cast<double>( trees[0].node_count() - 3 );
480 
481  // Compute matrix.
482  #pragma omp parallel for
483  for( size_t i = 0; i < rf.rows(); ++i ) {
484  for( size_t j = 0; j < rf.cols(); ++j ) {
485  result( i, j ) = static_cast<double>( rf( i, j )) / norm;
486  }
487  }
488 
489  return result;
490 }
491 
492 std::vector<double> rf_distance_relative( Tree const& lhs, TreeSet const& rhs )
493 {
494  // Prepare result.
495  auto result = std::vector<double>( rhs.size() );
496 
497  // Compute abs rf dist.
498  auto const rf = rf_distance_absolute( lhs, rhs );
499  assert( rf.size() == rhs.size() );
500 
501  // Get norm factor.
502  if( lhs.node_count() < 3 ) {
503  throw std::runtime_error(
504  "Cannot compute relative RF distance for trees with less than 3 taxa."
505  );
506  }
507  double const norm = 2.0 * static_cast<double>( lhs.node_count() - 3 );
508 
509  // Compute vector.
510  #pragma omp parallel for
511  for( size_t i = 0; i < rf.size(); ++i ) {
512  result[i] = static_cast<double>( rf[i] ) / norm;
513  }
514 
515  return result;
516 }
517 
518 double rf_distance_relative( Tree const& lhs, Tree const& rhs )
519 {
520  // Compute abs rf dist.
521  auto const rf = rf_distance_absolute( lhs, rhs );
522 
523  // Get norm factor.
524  if( lhs.node_count() < 3 ) {
525  throw std::runtime_error(
526  "Cannot compute relative RF distance for trees with less than 3 taxa."
527  );
528  }
529  double const norm = 2.0 * static_cast<double>( lhs.node_count() - 3 );
530 
531  return static_cast<double>( rf ) / norm;
532 }
533 
534 } // namespace tree
535 } // namespace genesis
bool empty() const
Return whether the TreeSet is empty.
Definition: tree_set.hpp:219
bool is_rooted(Tree const &tree)
Return whether the Tree is rooted, that is, whether the root node has two neighbors.
Provides some valuable algorithms that are not part of the C++ 11 STL.
utils::Matrix< double > rf_distance_relative(TreeSet const &trees)
Compute the pairwise relative RF (Robinson-Foulds) distance metric between a set of trees...
Definition: rf.cpp:459
size_t inner_edge_count(Tree const &tree)
Return the number of Edges of a Tree that do not lead to a leaf Node.
size_t size() const
Return the size of the TreeSet, i.e., the number of stored Trees.
Definition: tree_set.hpp:228
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
size_t node_count(Tree const &tree)
Return the number of Nodes of a Tree. Same as Tree::node_count().
void set(size_t index)
Set the value of a single bit to true, with boundary check.
Definition: bitvector.hpp:147
TreeNode & secondary_node()
Return the TreeNode of this TreeEdge that points away from the root.
Definition: edge.hpp:162
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
utils::Range< IteratorPostorder< true > > postorder(ElementType const &element)
utils::Matrix< size_t > rf_distance_absolute(TreeSet const &trees)
Compute the pairwise absolute RF (Robinson-Foulds) distance metric between a set of trees...
Definition: rf.cpp:358
utils::Range< IteratorNodes > nodes()
Definition: tree/tree.hpp:418
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
std::vector< utils::Bitvector > rf_get_bitvectors(Tree const &tree, std::unordered_map< std::string, size_t > const &names)
Get all split Bitvectors for a given Tree.
Definition: rf.cpp:218
void rf_get_bitvectors_template(Tree const &tree, std::unordered_map< std::string, size_t > const &names, BitvectorProcessor const &process_bitvector)
Local helper function template that constructs all bitvectors for the splits of a tree...
Definition: rf.cpp:109
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.hpp:264
Common class containing the commonly needed data for tree nodes.
TreeNode & root_node()
Return the TreeNode at the current root of the Tree.
Definition: tree/tree.hpp:300
size_t index() const
Return the index of this Node.
Definition: node.hpp:102
size_t degree(TreeLink const &link)
Return the degree of the node for a given TreeLink, i.e. how many neighbouring nodes it has...
std::unordered_map< std::string, size_t > rf_taxon_name_map(Tree const &tree)
Get a mapping from taxon names to unique IDs.
Definition: rf.cpp:54
TreeEdge & primary_edge()
Return the TreeEdge that points towards the root.
Definition: node.hpp:148
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
std::unordered_map< utils::Bitvector, utils::Bitvector > rf_get_occurrences(TreeSet const &trees)
Get an occurrence map for each split found in the given TreeSet.
Definition: rf.cpp:240