A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tree/bipartition/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
39 #include "genesis/tree/tree.hpp"
43 
44 #include <cassert>
45 #include <limits>
46 #include <stdexcept>
47 
48 namespace genesis {
49 namespace tree {
50 
51 // =================================================================================================
52 // Bipartition Functions
53 // =================================================================================================
54 
55 std::vector<Bipartition> bipartition_set( Tree const& tree )
56 {
57  // Result
58  const size_t num_leaves = leaf_node_count( tree );
59  auto bipartitions = std::vector<Bipartition>( tree.edge_count() );
60 
61  auto const node_to_leafs = node_to_leaf_map( tree );
62 
63  // Fill bitvectors
64  for( auto it : postorder(tree) ) {
65  if (it.is_last_iteration()) {
66  continue;
67  }
68 
69  auto bp = Bipartition( it.link(), utils::Bitvector( num_leaves ));
70 
71  // If the iterator is at a leaf, just set one bit in the bitvector.
72  if( it.node().is_leaf() ) {
73  auto const leaf_idx = node_to_leafs[ it.node().index() ];
74  assert( leaf_idx != std::numeric_limits<std::size_t>::max() );
75  bp.bitvector().set(leaf_idx);
76 
77  // For inner iterator positions, consider the whole subtree below it.
78  } else {
79  auto l = &it.link().next();
80  while( l != &it.link() ) {
81  assert( ! bipartitions[ l->edge().index() ].empty() );
82  bp.bitvector() |= bipartitions[ l->edge().index() ].leaf_nodes();
83  l = &l->next();
84  }
85  }
86 
87  // Store at the current edge
88  assert( bipartitions[ it.edge().index() ].empty() );
89  bipartitions[ it.edge().index() ] = bp;
90  }
91 
92  // Make sure all bips are filled.
93  assert(
94  [ &bipartitions ](){
95  for( auto const& bip : bipartitions ) {
96  if( bip.empty() ) {
97  return false;
98  }
99  }
100  return true;
101  }()
102  );
103 
104  return bipartitions;
105 }
106 
107 std::vector<size_t> node_to_leaf_map( Tree const& tree )
108 {
109  // Currently, we fix this so that leaves are always ordered.
110  // This is a bit slower, but better when working with multiple trees.
111  // In the future, this whole interface could be refined...
112  bool const sort_leaves = true;
113 
114  // Make an lookup of consecutive leaf nodes.
115  std::vector<size_t> nodes_to_leafs;
116  nodes_to_leafs.resize( tree.node_count() );
117 
118  // Find node string order if needed.
119  std::vector<size_t> name_order;
120  if( sort_leaves ) {
121 
122  // Make a sorted list of leave node names.
123  std::vector<std::string> node_names;
124  for( auto const& node_it : tree.nodes() ) {
125  if( node_it->is_leaf() ) {
126  node_names.push_back( node_it->data<DefaultNodeData>().name );
127  }
128  }
129 
130  // Get an order mapping list, that gives us the n-th index according to order.
131  // That is, it maps order to indices.
132  auto const ordered = utils::sort_indices( node_names.begin(), node_names.end() );
133 
134  // Check for duplicate names.
135  for( size_t i = 1; i < ordered.size(); ++i ) {
136  if( node_names[ ordered[ i ]] == node_names[ ordered[ i - 1 ]] ) {
137  throw std::runtime_error(
138  "Cannot build bipartitions for Tree that has duplicate node names."
139  );
140  }
141  }
142 
143  // Reverse the mapping, so that we get a lookup from index to order.
144  name_order.resize( ordered.size() );
145  for( size_t i = 0; i < ordered.size(); ++i ) {
146  name_order[ ordered[i] ] = i;
147  }
148  }
149 
150  // Assign indices to each node.
151  size_t leaf_idx = 0;
152  for( auto const& node_it : tree.nodes() ) {
153  if( node_it->is_leaf() ) {
154  if( sort_leaves ) {
155 
156  // Use the index to order map to get the ordered leaf index.
157  assert( leaf_idx < name_order.size() );
158  nodes_to_leafs[ node_it->index() ] = name_order[ leaf_idx ];
159  } else {
160 
161  // Unsorted case: just count upwards.
162  nodes_to_leafs[ node_it->index() ] = leaf_idx;
163  }
164  ++leaf_idx;
165  } else {
166  nodes_to_leafs[ node_it->index() ] = std::numeric_limits<std::size_t>::max();
167  }
168  }
169 
170  return nodes_to_leafs;
171 }
172 
173 utils::Bitvector leaf_node_bitvector( Tree const& tree, std::vector<TreeNode const*> leaf_nodes )
174 {
175  auto const node_to_leafs = node_to_leaf_map( tree );
176  utils::Bitvector result( leaf_node_count( tree ));
177  for( auto n : leaf_nodes ) {
178  auto const leaf_idx = node_to_leafs[ n->index() ];
179  if( leaf_idx == std::numeric_limits<std::size_t>::max() ) {
180  throw std::runtime_error(
181  "Node at index " + utils::to_string( n->index() ) + " is not a leaf."
182  );
183  }
184  result.set( leaf_idx );
185  }
186  return result;
187 }
188 
189 std::vector<size_t> get_subtree_edges( TreeLink const& subtree )
190 {
191  std::vector<size_t> ret;
192 
193  // We don't want to use the standard iterator wrapper function here, as we are going
194  // to end the iteration after the end of the subtree, instead of iterating the whole tree.
195  // So we need to use the iterator class directly.
197 
198  for(
199  auto it = Preorder( subtree.next() );
200  it != Preorder() && &it.link() != &subtree.outer();
201  ++it
202  ) {
203  if (it.is_first_iteration()) {
204  continue;
205  }
206  ret.push_back( it.edge().index() );
207  }
208 
209  return ret;
210 }
211 
213  Tree const& tree,
214  std::vector<Bipartition> const& bips,
215  std::vector<TreeNode const*> nodes
216 ) {
217  // Result. We use a bitvec of the edges that we want, to save space.
218  auto result_edges = utils::Bitvector( tree.edge_count() );
219 
220  // Helper funciton to set result edges of a bipartition.
221  auto set_result_edges = [ &result_edges ]( Bipartition const& bip ){
222 
223  // Add all subtree edges via custom traversal
225  for(
226  auto it = Preorder( bip.link().next() );
227  it != Preorder() && &it.link() != &bip.link().outer();
228  ++it
229  ) {
230  if (it.is_first_iteration()) {
231  continue;
232  }
233  result_edges.set( it.edge().index() );
234  }
235 
236  // Also add the edge of the split itself. This is necessary for leaves,
237  // but also we want to consider inner branches to be part of the clade.
238  result_edges.set( bip.link().edge().index() );
239  };
240 
241  // Get the bitvec that represets the leaf nodes we are looking for.
242  auto const leaves = leaf_node_bitvector( tree, nodes );
243 
244  // For each bip, check if one of its splits contains only nodes we are looking for.
245  // If so, add all edges of that split to the result.
246  for( auto const& bip : bips ) {
247  if( bip.empty() ) {
248  continue;
249  }
250 
251  // If all tips of the bip are in our node list, we found a monophyletic clade.
252  if( ( bip.leaf_nodes() & leaves ) == bip.leaf_nodes() ) {
253  set_result_edges( bip );
254  }
255 
256  // Same for inverted case.
257  auto inverted = bip;
258  inverted.invert();
259  if( ( inverted.leaf_nodes() & leaves ) == inverted.leaf_nodes() ) {
260  set_result_edges( inverted );
261  }
262  }
263 
264  // Turn bitvec into edges by adding all indices where the bitvec is set.
265  std::vector<size_t> edges;
266  for( size_t i = 0; i < result_edges.size(); ++i ) {
267  if( result_edges.get( i ) ) {
268  edges.push_back( i );
269  }
270  }
271  return edges;
272 }
273 
275  Tree const& tree,
276  std::vector<Bipartition> const& bipartitions,
277  std::vector<TreeNode const*> nodes
278 ) {
279  // Get the bitvector to compare against
280  auto const comp = leaf_node_bitvector( tree, nodes );
281 
282  // Store best results.
283  Bipartition best_bip;
284  size_t min_count = 0;
285 
286  // Loop over all bipartitions and compare their bitvectors to the given one, to find one that
287  // is a superset. Try both ways (normal and inverted) for each bipartition.
288  for( Bipartition const& bip : bipartitions ) {
289  if( bip.empty() ) {
290  continue;
291  }
292 
293  auto const inverted = ~(bip.leaf_nodes());
294  if( utils::is_subset( comp, bip.leaf_nodes() )) {
295  if( min_count == 0 || bip.leaf_nodes().count() < min_count ) {
296  best_bip = bip;
297  min_count = bip.leaf_nodes().count();
298  }
299  }
300  if( utils::is_subset( comp, inverted ) ) {
301  if (min_count == 0 || inverted.count() < min_count) {
302  best_bip = bip;
303  best_bip.invert();
304  min_count = bip.leaf_nodes().count();
305  }
306  }
307  }
308 
309  return best_bip;
310 }
311 
312 std::vector<size_t> get_clade_edges( Tree const& tree, std::vector< tree::TreeNode const* > const& nodes )
313 {
314  // Find the edges that are part of the subtree of this clade.
315  // This part is a bit messy and might be cleaned up in the future.
316  auto const bipartitions = bipartition_set( tree );
317  auto const smallest = find_smallest_subtree( tree, bipartitions, nodes );
318  auto const subedges = get_subtree_edges( smallest.link() );
319  return subedges;
320 }
321 
322 std::vector<size_t> get_clade_edges( Tree const& tree, std::vector< std::string > const& node_names )
323 {
324  // Find the nodes that belong to the taxa of this clade.
325  std::vector< tree::TreeNode const* > node_list;
326  for( auto const& taxon : node_names ) {
327  auto node = find_node( tree, taxon );
328  if( node == nullptr ) {
329  throw std::runtime_error( "Cannot find node " + taxon + " in tree." );
330  }
331  node_list.push_back( node );
332  }
333  return get_clade_edges( tree, node_list );
334 }
335 
336 } // namespace tree
337 } // namespace genesis
size_t count() const
Count the number of set bits in the Bitvector, that is, its Hamming weight.
Definition: bitvector.cpp:174
Provides some valuable algorithms that are not part of the C++ 11 STL.
std::unordered_set< std::string > node_names(Tree const &tree, bool leaves_only)
Returns an unordered set of all TreeNode names of a Tree.
std::vector< size_t > find_monophyletic_subtree_edges(Tree const &tree, std::vector< Bipartition > const &bips, std::vector< TreeNode const * > nodes)
Find clades of the tree that are monophyletic with respect to the given list of nodes, that is, clades that only contain nodes from that list. Retun all edge indices of those clades.
std::vector< size_t > get_clade_edges(Tree const &tree, std::vector< tree::TreeNode const * > const &nodes)
Default class containing the commonly needed data for tree nodes.
TreeNode const * find_node(Tree const &tree, const std::string &name, bool replace_underscores)
Finds a Node, given its name. If not found, nullptr is returned.
size_t leaf_node_count(Tree const &tree)
Count the number of leaf Nodes of a Tree.
void set(size_t index)
Set the value of a single bit to true, with boundary check.
Definition: bitvector.hpp:139
utils::Range< IteratorNodes > nodes()
Definition: tree/tree.cpp:484
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:373
std::vector< Bipartition > bipartition_set(Tree const &tree)
std::vector< size_t > sort_indices(RandomAccessIterator first, RandomAccessIterator last, Comparator comparator)
Get the indices to the sorted order of the given range.
Definition: algorithm.hpp:124
utils::Bitvector const & leaf_nodes() const
Definition: bipartition.hpp:81
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.cpp:350
std::vector< size_t > node_to_leaf_map(Tree const &tree)
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
Provides some commonly used string utility functions.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
std::vector< size_t > get_subtree_edges(TreeLink const &subtree)
Default Tree functions.
bool is_subset(Bitvector const &sub, Bitvector const &super)
Subset or equal.
std::string name
Name of the node.
Bipartition find_smallest_subtree(Tree const &tree, std::vector< Bipartition > const &bipartitions, std::vector< TreeNode const * > nodes)
Find the smallest subtree (measured in number of nodes) that contains all given nodes.
Header of Tree class.
utils::Range< IteratorPostorder< TreeLink const, TreeNode const, TreeEdge const > > postorder(ElementType const &element)
utils::Bitvector leaf_node_bitvector(Tree const &tree, std::vector< TreeNode const * > leaf_nodes)
Return a Bitvector that has as many entries as the tree has leaf nodes, and is true where the given l...