A toolkit for working with phylogenetic data.
v0.24.0
tree/bipartition/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
40 #include "genesis/tree/tree.hpp"
45 
46 #include <cassert>
47 #include <limits>
48 #include <stdexcept>
49 #include <string>
50 
51 namespace genesis {
52 namespace tree {
53 
54 // =================================================================================================
55 // Bipartition Helper Functions
56 // =================================================================================================
57 
58 std::vector<Bipartition> bipartition_set( Tree const& tree )
59 {
60  // Result
61  const size_t num_leaves = leaf_node_count( tree );
62  auto bipartitions = std::vector<Bipartition>( tree.edge_count() );
63 
64  auto const node_to_leafs = node_to_leaf_map( tree );
65 
66  // Fill bitvectors
67  for( auto it : postorder(tree) ) {
68  if (it.is_last_iteration()) {
69  continue;
70  }
71 
72  auto bp = Bipartition( it.link(), utils::Bitvector( num_leaves ));
73 
74  // If the iterator is at a leaf, just set one bit in the bitvector.
75  if( is_leaf( it.node() ) ) {
76  auto const leaf_idx = node_to_leafs[ it.node().index() ];
77  assert( leaf_idx != std::numeric_limits<std::size_t>::max() );
78  bp.bitvector().set(leaf_idx);
79 
80  // For inner iterator positions, consider the whole subtree below it.
81  } else {
82  auto l = &it.link().next();
83  while( l != &it.link() ) {
84  assert( ! bipartitions[ l->edge().index() ].empty() );
85  bp.bitvector() |= bipartitions[ l->edge().index() ].leaf_nodes();
86  l = &l->next();
87  }
88  }
89 
90  // Store at the current edge
91  assert( bipartitions[ it.edge().index() ].empty() );
92  bipartitions[ it.edge().index() ] = bp;
93  }
94 
95  // Make sure all bips are filled.
96  assert(
97  [ &bipartitions ](){
98  for( auto const& bip : bipartitions ) {
99  if( bip.empty() ) {
100  return false;
101  }
102  }
103  return true;
104  }()
105  );
106 
107  return bipartitions;
108 }
109 
110 std::vector<size_t> node_to_leaf_map( Tree const& tree )
111 {
112  // Currently, we fix this so that leaves are always ordered.
113  // This is a bit slower, but better when working with multiple trees.
114  // In the future, this whole interface could be refined...
115  bool const sort_leaves = true;
116 
117  // Make an lookup of consecutive leaf nodes.
118  std::vector<size_t> nodes_to_leafs;
119  nodes_to_leafs.resize( tree.node_count() );
120 
121  // Find node string order if needed.
122  std::vector<size_t> name_order;
123  if( sort_leaves ) {
124 
125  // Make a sorted list of leave node names.
126  std::vector<std::string> node_names;
127  for( auto const& node : tree.nodes() ) {
128  if( is_leaf( node )) {
129  node_names.push_back( node.data<CommonNodeData>().name );
130  }
131  }
132 
133  // Get an order mapping list, that gives us the n-th index according to order.
134  // That is, it maps order to indices.
135  auto const ordered = utils::sort_indices( node_names.begin(), node_names.end() );
136 
137  // Check for duplicate names.
138  for( size_t i = 1; i < ordered.size(); ++i ) {
139  if( node_names[ ordered[ i ]] == node_names[ ordered[ i - 1 ]] ) {
140  throw std::runtime_error(
141  "Cannot build bipartitions for Tree that has duplicate node names."
142  );
143  }
144  }
145 
146  // Reverse the mapping, so that we get a lookup from index to order.
147  name_order.resize( ordered.size() );
148  for( size_t i = 0; i < ordered.size(); ++i ) {
149  name_order[ ordered[i] ] = i;
150  }
151  }
152 
153  // Assign indices to each node.
154  size_t leaf_idx = 0;
155  for( auto const& node : tree.nodes() ) {
156  if( is_leaf( node )) {
157  if( sort_leaves ) {
158 
159  // Use the index to order map to get the ordered leaf index.
160  assert( leaf_idx < name_order.size() );
161  nodes_to_leafs[ node.index() ] = name_order[ leaf_idx ];
162  } else {
163 
164  // Unsorted case: just count upwards.
165  nodes_to_leafs[ node.index() ] = leaf_idx;
166  }
167  ++leaf_idx;
168  } else {
169  nodes_to_leafs[ node.index() ] = std::numeric_limits<std::size_t>::max();
170  }
171  }
172 
173  return nodes_to_leafs;
174 }
175 
176 utils::Bitvector leaf_node_bitvector( Tree const& tree, std::vector<TreeNode const*> leaf_nodes )
177 {
178  auto const node_to_leafs = node_to_leaf_map( tree );
179  utils::Bitvector result( leaf_node_count( tree ));
180  for( auto n : leaf_nodes ) {
181  auto const leaf_idx = node_to_leafs[ n->index() ];
182  if( leaf_idx == std::numeric_limits<std::size_t>::max() ) {
183  throw std::runtime_error(
184  "Node at index " + std::to_string( n->index() ) + " is not a leaf."
185  );
186  }
187  result.set( leaf_idx );
188  }
189  return result;
190 }
191 
192 std::vector<size_t> get_subtree_edges( TreeLink const& subtree )
193 {
194  std::vector<size_t> ret;
195 
196  // We don't want to use the standard iterator wrapper function here, as we are going
197  // to end the iteration after the end of the subtree, instead of iterating the whole tree.
198  // So we need to use the iterator class directly.
199  using Preorder = IteratorPreorder< true >;
200 
201  for(
202  auto it = Preorder( subtree.next() );
203  it != Preorder() && &it.link() != &subtree.outer();
204  ++it
205  ) {
206  if (it.is_first_iteration()) {
207  continue;
208  }
209  ret.push_back( it.edge().index() );
210  }
211 
212  return ret;
213 }
214 
216  Tree const& tree,
217  std::vector<Bipartition> const& bipartitions,
218  std::vector<TreeNode const*> const& nodes
219 ) {
220  // Error checks.
221  if( bipartitions.size() != tree.edge_count() ) {
222  throw std::runtime_error(
223  "Cannot find smallest subtree, as the nunber of given bipartitions does not match "
224  "the number of edges in the given tree. Use bipartition_set( tree ) to obtain a valid "
225  "set of bipartitions for the tree."
226  );
227  }
228 
229  // Get the bitvector to compare against
230  auto const comp = leaf_node_bitvector( tree, nodes );
231 
232  // Store best results.
233  Bipartition best_bip;
234  size_t min_count = 0;
235 
236  // Loop over all bipartitions and compare their bitvectors to the given one, to find one that
237  // is a superset. Try both ways (normal and inverted) for each bipartition.
238  for( Bipartition const& bip : bipartitions ) {
239  if( bip.empty() ) {
240  continue;
241  }
242  if( ! belongs_to( bip.link(), tree )) {
243  throw std::runtime_error(
244  "Cannot find smallest subtree, as the bipartitions were not extracted for the given "
245  "tree. Use bipartition_set( tree ) to obtain a valid "
246  "set of bipartitions for the tree."
247  );
248  }
249 
250  auto const inverted = ~(bip.leaf_nodes());
251  if( utils::is_subset( comp, bip.leaf_nodes() )) {
252  if( min_count == 0 || bip.leaf_nodes().count() < min_count ) {
253  best_bip = bip;
254  min_count = bip.leaf_nodes().count();
255  }
256  }
257  if( utils::is_subset( comp, inverted ) ) {
258  if (min_count == 0 || inverted.count() < min_count) {
259  best_bip = bip;
260  best_bip.invert();
261  min_count = bip.leaf_nodes().count();
262  }
263  }
264  }
265 
266  return best_bip;
267 }
268 
269 // =================================================================================================
270 // Monophyletic Subtree Functions
271 // =================================================================================================
272 
274  Tree const& tree,
275  std::vector<Bipartition> const& bipartitions,
276  std::vector<TreeNode const*> const& nodes,
277  bool include_splitting_edges,
278  bool include_leaf_edges
279 ) {
280  // Result. We use a bitvec of the edges that we want, to save space.
281  auto result_edges = utils::Bitvector( tree.edge_count() );
282 
283  // Helper funciton to set result edges of a bipartition.
284  auto set_result_edges = [&]( Bipartition const& bip ){
285 
286  // Add all subtree edges via custom traversal.
287  // (We do have preorder traversal for subtrees not, could use that. But too lazy right now...)
288  using Preorder = IteratorPreorder< true >;
289  for(
290  auto it = Preorder( bip.link().next() );
291  it != Preorder() && &it.link() != &bip.link().outer();
292  ++it
293  ) {
294  if (it.is_first_iteration()) {
295  continue;
296  }
297  result_edges.set( it.edge().index() );
298  }
299 
300  // Also add the edge of the split itself. This is necessary for leaves,
301  // but also we want to consider inner branches to be part of the clade.
302  if( include_splitting_edges || ( include_leaf_edges && is_leaf( bip.link().edge() ) )) {
303  result_edges.set( bip.link().edge().index() );
304  }
305  };
306 
307  // Get the bitvec that represets the leaf nodes we are looking for.
308  auto const leaves = leaf_node_bitvector( tree, nodes );
309 
310  // For each bip, check if one of its splits contains only nodes we are looking for.
311  // If so, add all edges of that split to the result.
312  for( auto const& bip : bipartitions ) {
313  if( bip.empty() ) {
314  continue;
315  }
316 
317  // If all tips of the bip are in our node list, we found a monophyletic clade.
318  if( ( bip.leaf_nodes() & leaves ) == bip.leaf_nodes() ) {
319  set_result_edges( bip );
320  }
321 
322  // Same for inverted case.
323  auto inverted = bip;
324  inverted.invert();
325  if( ( inverted.leaf_nodes() & leaves ) == inverted.leaf_nodes() ) {
326  set_result_edges( inverted );
327  }
328  }
329 
330  // Turn bitvec into edges by adding all indices where the bitvec is set.
331  std::vector<size_t> edges;
332  for( size_t i = 0; i < result_edges.size(); ++i ) {
333  if( result_edges.get( i ) ) {
334  edges.push_back( i );
335  }
336  }
337  return edges;
338 }
339 
341  Tree const& tree,
342  std::vector<TreeNode const*> const& nodes,
343  bool include_splitting_edges,
344  bool include_leaf_edges
345 ) {
346  auto const bipartitions = bipartition_set( tree );
348  tree, bipartitions, nodes, include_splitting_edges, include_leaf_edges
349  );
350 }
351 
353  Tree const& tree,
354  std::vector< std::string > const& node_names,
355  bool include_splitting_edges,
356  bool include_leaf_edges
357 ) {
358  auto const bipartitions = bipartition_set( tree );
359  auto const nodes = find_nodes( tree, node_names, true );
361  tree, bipartitions, nodes, include_splitting_edges, include_leaf_edges
362  );
363 }
364 
365 // =================================================================================================
366 // Whole Clade Functions
367 // =================================================================================================
368 
369 std::vector<size_t> get_clade_edges(
370  Tree const& tree,
371  std::vector< tree::TreeNode const* > const& nodes
372 ) {
373  // Find the edges that are part of the subtree of this clade.
374  auto const bipartitions = bipartition_set( tree );
375  auto const smallest = find_smallest_subtree( tree, bipartitions, nodes );
376  auto const subedges = get_subtree_edges( smallest.link() );
377  return subedges;
378 }
379 
380 std::vector<size_t> get_clade_edges(
381  Tree const& tree,
382  std::vector< std::string > const& node_names
383 ) {
384  return get_clade_edges( tree, find_nodes( tree, node_names, true ));
385 }
386 
387 } // namespace tree
388 } // namespace genesis
Provides some valuable algorithms that are not part of the C++ 11 STL.
size_t count() const
Count the number of set bits in the Bitvector, that is, its Hamming weight, or population count (popc...
Definition: bitvector.cpp:223
Bipartition find_smallest_subtree(Tree const &tree, std::vector< Bipartition > const &bipartitions, std::vector< TreeNode const *> const &nodes)
Find the smallest subtree (measured in number of nodes) that contains all given nodes.
Tree operator functions.
std::vector< size_t > get_clade_edges(Tree const &tree, std::vector< tree::TreeNode const * > const &nodes)
CommonTree functions.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
utils::Bitvector leaf_node_bitvector(Tree const &tree, std::vector< TreeNode const *> leaf_nodes)
Return a Bitvector that has as many entries as the tree has leaf nodes, and is true where the given l...
void set(size_t index)
Set the value of a single bit to true, with boundary check.
Definition: bitvector.hpp:147
utils::Bitvector const & leaf_nodes() const
Definition: bipartition.hpp:92
std::vector< TreeNode const * > find_nodes(Tree const &tree, std::vector< std::string > const &node_names, bool throw_on_failure, bool replace_underscores)
Find TreeNodes in a Tree, given their name.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::vector< size_t > sort_indices(RandomAccessIterator first, RandomAccessIterator last, Comparator comparator)
Get the indices to the sorted order of the given range.
Definition: algorithm.hpp:182
utils::Range< IteratorPostorder< true > > postorder(ElementType const &element)
bool is_subset(Bitvector const &sub, Bitvector const &super)
Subset or equal.
std::vector< size_t > node_to_leaf_map(Tree const &tree)
size_t leaf_node_count(Tree const &tree)
Count the number of leaf Nodes of a Tree.
utils::Range< IteratorNodes > nodes()
Definition: tree/tree.hpp:418
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
std::vector< Bipartition > bipartition_set(Tree const &tree)
Provides some commonly used string utility functions.
bool belongs_to(Tree const &tree, TreeNode const &node)
Return whether the TreeNode belongs to the Tree, i.e., whether it is owned by the Tree...
std::string name
Name of the node.
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.hpp:264
std::vector< size_t > get_subtree_edges(TreeLink const &subtree)
std::vector< size_t > find_monophyletic_subtree_edges(Tree const &tree, std::vector< Bipartition > const &bipartitions, std::vector< TreeNode const *> const &nodes, bool include_splitting_edges, bool include_leaf_edges)
Find clades of the tree that are monophyletic with respect to the given list of nodes, that is, clades that only contain nodes from that list. Return all edge indices of those clades.
Common class containing the commonly needed data for tree nodes.
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
Header of Tree class.
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
std::vector< std::string > node_names(Tree const &tree, bool leaves_only)
Returns a list of all TreeNode names of a Tree.