A library for working with phylogenetic and population genetic data.
v0.27.0
tree/bipartition/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
40 #include "genesis/tree/tree.hpp"
45 
46 #include <cassert>
47 #include <limits>
48 #include <stdexcept>
49 #include <string>
50 
51 namespace genesis {
52 namespace tree {
53 
54 // =================================================================================================
55 // Bipartition Helper Functions
56 // =================================================================================================
57 
58 std::vector<Bipartition> bipartition_set( Tree const& tree )
59 {
60  // Result
61  const size_t num_leaves = leaf_node_count( tree );
62  auto bipartitions = std::vector<Bipartition>( tree.edge_count() );
63 
64  auto const node_to_leafs = node_to_leaf_map( tree );
65 
66  // Fill bitvectors
67  for( auto it : postorder(tree) ) {
68  if (it.is_last_iteration()) {
69  continue;
70  }
71 
72  auto bp = Bipartition( it.link(), utils::Bitvector( num_leaves ));
73 
74  // If the iterator is at a leaf, just set one bit in the bitvector.
75  if( is_leaf( it.node() ) ) {
76  auto const leaf_idx = node_to_leafs[ it.node().index() ];
77  assert( leaf_idx != std::numeric_limits<std::size_t>::max() );
78  bp.bitvector().set(leaf_idx);
79 
80  // For inner iterator positions, consider the whole subtree below it.
81  } else {
82  auto l = &it.link().next();
83  while( l != &it.link() ) {
84  assert( ! bipartitions[ l->edge().index() ].empty() );
85  bp.bitvector() |= bipartitions[ l->edge().index() ].leaf_nodes();
86  l = &l->next();
87  }
88  }
89 
90  // Store at the current edge
91  assert( bipartitions[ it.edge().index() ].empty() );
92  bipartitions[ it.edge().index() ] = bp;
93  }
94 
95  // Make sure all bips are filled.
96  assert(
97  [ &bipartitions ](){
98  for( auto const& bip : bipartitions ) {
99  if( bip.empty() ) {
100  return false;
101  }
102  }
103  return true;
104  }()
105  );
106 
107  return bipartitions;
108 }
109 
110 std::vector<size_t> node_to_leaf_map( Tree const& tree )
111 {
112  // Currently, we fix this so that leaves are always ordered.
113  // This is a bit slower, but better when working with multiple trees.
114  // In the future, this whole interface could be refined...
115  bool const sort_leaves = true;
116 
117  // Make an lookup of consecutive leaf nodes.
118  std::vector<size_t> nodes_to_leafs;
119  nodes_to_leafs.resize( tree.node_count() );
120 
121  // Find node string order if needed.
122  std::vector<size_t> name_order;
123  if( sort_leaves ) {
124 
125  // Make a sorted list of leave node names.
126  std::vector<std::string> node_names;
127  for( auto const& node : tree.nodes() ) {
128  if( is_leaf( node )) {
129  node_names.push_back( node.data<CommonNodeData>().name );
130  }
131  }
132 
133  // Get an order mapping list, that gives us the n-th index according to order.
134  // That is, it maps order to indices.
135  auto const ordered = utils::sort_indices( node_names.begin(), node_names.end() );
136 
137  // Check for duplicate names.
138  for( size_t i = 1; i < ordered.size(); ++i ) {
139  if( node_names[ ordered[ i ]] == node_names[ ordered[ i - 1 ]] ) {
140  throw std::runtime_error(
141  "Cannot build bipartitions for Tree that has duplicate node names."
142  );
143  }
144  }
145 
146  // Reverse the mapping, so that we get a lookup from index to order.
147  name_order.resize( ordered.size() );
148  for( size_t i = 0; i < ordered.size(); ++i ) {
149  name_order[ ordered[i] ] = i;
150  }
151  }
152 
153  // Assign indices to each node.
154  size_t leaf_idx = 0;
155  for( auto const& node : tree.nodes() ) {
156  if( is_leaf( node )) {
157  if( sort_leaves ) {
158 
159  // Use the index to order map to get the ordered leaf index.
160  assert( leaf_idx < name_order.size() );
161  nodes_to_leafs[ node.index() ] = name_order[ leaf_idx ];
162  } else {
163 
164  // Unsorted case: just count upwards.
165  nodes_to_leafs[ node.index() ] = leaf_idx;
166  }
167  ++leaf_idx;
168  } else {
169  nodes_to_leafs[ node.index() ] = std::numeric_limits<std::size_t>::max();
170  }
171  }
172 
173  return nodes_to_leafs;
174 }
175 
176 utils::Bitvector leaf_node_bitvector( Tree const& tree, std::vector<TreeNode const*> leaf_nodes )
177 {
178  auto const node_to_leafs = node_to_leaf_map( tree );
179  utils::Bitvector result( leaf_node_count( tree ));
180  for( auto n : leaf_nodes ) {
181  auto const leaf_idx = node_to_leafs[ n->index() ];
182  if( leaf_idx == std::numeric_limits<std::size_t>::max() ) {
183  throw std::runtime_error(
184  "Node at index " + std::to_string( n->index() ) + " is not a leaf."
185  );
186  }
187  result.set( leaf_idx );
188  }
189  return result;
190 }
191 
192 std::vector<size_t> get_subtree_edges( TreeLink const& subtree )
193 {
194  std::vector<size_t> ret;
195 
196  // We don't want to use the standard iterator wrapper function here, as we are going
197  // to end the iteration after the end of the subtree, instead of iterating the whole tree.
198  // So we need to use the iterator class directly.
199  using Preorder = IteratorPreorder< true >;
200 
201  for(
202  auto it = Preorder( subtree.next() );
203  it != Preorder() && &it.link() != &subtree.outer();
204  ++it
205  ) {
206  if (it.is_first_iteration()) {
207  continue;
208  }
209  ret.push_back( it.edge().index() );
210  }
211 
212  return ret;
213 }
214 
216  Tree const& tree,
217  std::vector<Bipartition> const& bipartitions,
218  std::vector<TreeNode const*> const& nodes
219 ) {
220  // Error checks.
221  if( bipartitions.size() != tree.edge_count() ) {
222  throw std::runtime_error(
223  "Cannot find smallest subtree, as the nunber of given bipartitions does not match "
224  "the number of edges in the given tree. Use bipartition_set( tree ) to obtain a valid "
225  "set of bipartitions for the tree."
226  );
227  }
228 
229  // Get the bitvector to compare against
230  auto const comp = leaf_node_bitvector( tree, nodes );
231 
232  // Store best results.
233  Bipartition best_bip;
234  size_t min_count = 0;
235 
236  // Loop over all bipartitions and compare their bitvectors to the given one, to find one that
237  // is a superset. Try both ways (normal and inverted) for each bipartition.
238  for( Bipartition const& bip : bipartitions ) {
239  if( bip.empty() ) {
240  continue;
241  }
242  if( ! belongs_to( bip.link(), tree )) {
243  throw std::runtime_error(
244  "Cannot find smallest subtree, as the bipartitions were not extracted for the given "
245  "tree. Use bipartition_set( tree ) to obtain a valid "
246  "set of bipartitions for the tree."
247  );
248  }
249 
250  auto const inverted = ~(bip.leaf_nodes());
251  if( utils::is_subset( comp, bip.leaf_nodes() )) {
252  if( min_count == 0 || bip.leaf_nodes().count() < min_count ) {
253  best_bip = bip;
254  min_count = best_bip.leaf_nodes().count();
255  }
256  }
257  if( utils::is_subset( comp, inverted ) ) {
258  if( min_count == 0 || inverted.count() < min_count ) {
259  best_bip = bip;
260  best_bip.invert();
261 
262  assert( best_bip.leaf_nodes().count() == inverted.count() );
263  min_count = best_bip.leaf_nodes().count();
264  }
265  }
266  }
267 
268  return best_bip;
269 }
270 
271 // =================================================================================================
272 // Monophyletic Subtree Functions
273 // =================================================================================================
274 
276  Tree const& tree,
277  std::vector<Bipartition> const& bipartitions,
278  std::vector<TreeNode const*> const& nodes,
279  bool include_splitting_edges,
280  bool include_leaf_edges
281 ) {
282  // Result. We use a bitvec of the edges that we want, to save space.
283  auto result_edges = utils::Bitvector( tree.edge_count() );
284 
285  // Helper funciton to set result edges of a bipartition.
286  auto set_result_edges = [&]( Bipartition const& bip ){
287 
288  // Add all subtree edges via custom traversal.
289  // (We do have preorder traversal for subtrees not, could use that. But too lazy right now...)
290  using Preorder = IteratorPreorder< true >;
291  for(
292  auto it = Preorder( bip.link().next() );
293  it != Preorder() && &it.link() != &bip.link().outer();
294  ++it
295  ) {
296  if (it.is_first_iteration()) {
297  continue;
298  }
299  result_edges.set( it.edge().index() );
300  }
301 
302  // Also add the edge of the split itself. This is necessary for leaves,
303  // but also we want to consider inner branches to be part of the clade.
304  if( include_splitting_edges || ( include_leaf_edges && is_leaf( bip.link().edge() ) )) {
305  result_edges.set( bip.link().edge().index() );
306  }
307  };
308 
309  // Get the bitvec that represets the leaf nodes we are looking for.
310  auto const leaves = leaf_node_bitvector( tree, nodes );
311 
312  // For each bip, check if one of its splits contains only nodes we are looking for.
313  // If so, add all edges of that split to the result.
314  for( auto const& bip : bipartitions ) {
315  if( bip.empty() ) {
316  continue;
317  }
318 
319  // If all tips of the bip are in our node list, we found a monophyletic clade.
320  if( ( bip.leaf_nodes() & leaves ) == bip.leaf_nodes() ) {
321  set_result_edges( bip );
322  }
323 
324  // Same for inverted case.
325  auto inverted = bip;
326  inverted.invert();
327  if( ( inverted.leaf_nodes() & leaves ) == inverted.leaf_nodes() ) {
328  set_result_edges( inverted );
329  }
330  }
331 
332  // Turn bitvec into edges by adding all indices where the bitvec is set.
333  std::vector<size_t> edges;
334  for( size_t i = 0; i < result_edges.size(); ++i ) {
335  if( result_edges.get( i ) ) {
336  edges.push_back( i );
337  }
338  }
339  return edges;
340 }
341 
343  Tree const& tree,
344  std::vector<TreeNode const*> const& nodes,
345  bool include_splitting_edges,
346  bool include_leaf_edges
347 ) {
348  auto const bipartitions = bipartition_set( tree );
350  tree, bipartitions, nodes, include_splitting_edges, include_leaf_edges
351  );
352 }
353 
355  Tree const& tree,
356  std::vector< std::string > const& node_names,
357  bool include_splitting_edges,
358  bool include_leaf_edges
359 ) {
360  auto const bipartitions = bipartition_set( tree );
361  auto const nodes = find_nodes( tree, node_names, true );
363  tree, bipartitions, nodes, include_splitting_edges, include_leaf_edges
364  );
365 }
366 
367 // =================================================================================================
368 // Whole Clade Functions
369 // =================================================================================================
370 
371 std::vector<size_t> get_clade_edges(
372  Tree const& tree,
373  std::vector< tree::TreeNode const* > const& nodes
374 ) {
375  // Find the edges that are part of the subtree of this clade.
376  auto const bipartitions = bipartition_set( tree );
377  auto const smallest = find_smallest_subtree( tree, bipartitions, nodes );
378  auto const subedges = get_subtree_edges( smallest.link() );
379  return subedges;
380 }
381 
382 std::vector<size_t> get_clade_edges(
383  Tree const& tree,
384  std::vector< std::string > const& node_names
385 ) {
386  return get_clade_edges( tree, find_nodes( tree, node_names, true ));
387 }
388 
389 } // namespace tree
390 } // namespace genesis
genesis::tree::Bipartition::leaf_nodes
utils::Bitvector const & leaf_nodes() const
Definition: bipartition.hpp:92
algorithm.hpp
Provides some valuable algorithms that are not part of the C++ 11 STL.
genesis::tree::leaf_node_count
size_t leaf_node_count(Tree const &tree)
Count the number of leaf Nodes of a Tree.
Definition: tree/function/functions.cpp:168
genesis::utils::Bitvector::set
void set(size_t index)
Set the value of a single bit to true, with boundary check.
Definition: bitvector.hpp:147
genesis::tree::Tree::node_count
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.hpp:264
genesis::tree::find_smallest_subtree
Bipartition find_smallest_subtree(Tree const &tree, std::vector< Bipartition > const &bipartitions, std::vector< TreeNode const * > const &nodes)
Find the smallest subtree (measured in number of nodes) that contains all given nodes.
Definition: tree/bipartition/functions.cpp:215
functions.hpp
CommonTree functions.
genesis::tree::IteratorPreorder
Definition: tree/iterator/preorder.hpp:61
genesis::tree::get_subtree_edges
std::vector< size_t > get_subtree_edges(TreeLink const &subtree)
Definition: tree/bipartition/functions.cpp:192
genesis::tree::node_names
std::vector< std::string > node_names(Tree const &tree, bool leaves_only)
Returns a list of all TreeNode names of a Tree.
Definition: tree/common_tree/functions.cpp:52
tree.hpp
Header of Tree class.
functions.hpp
genesis::utils::sort_indices
std::vector< size_t > sort_indices(RandomAccessIterator first, RandomAccessIterator last, Comparator comparator)
Get the indices to the sorted order of the given range.
Definition: algorithm.hpp:182
genesis::tree::belongs_to
bool belongs_to(Tree const &tree, TreeNode const &node)
Return whether the TreeNode belongs to the Tree, i.e., whether it is owned by the Tree.
Definition: tree/function/operators.cpp:221
genesis::tree::Bipartition::invert
void invert()
Definition: bipartition.hpp:97
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
string.hpp
Provides some commonly used string utility functions.
genesis::tree::Tree
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
subtree.hpp
operators.hpp
bipartition.hpp
genesis::tree::CommonNodeData::name
std::string name
Name of the node.
Definition: tree/common_tree/tree.hpp:127
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::tree::find_nodes
std::vector< TreeNode const * > find_nodes(Tree const &tree, std::vector< std::string > const &node_names, bool throw_on_failure, bool replace_underscores)
Find TreeNodes in a Tree, given their name.
Definition: tree/common_tree/functions.cpp:122
operators.hpp
Tree operator functions.
genesis::tree::Tree::nodes
utils::Range< IteratorNodes > nodes()
Definition: tree/tree.hpp:418
tree.hpp
genesis::tree::leaf_node_bitvector
utils::Bitvector leaf_node_bitvector(Tree const &tree, std::vector< TreeNode const * > leaf_nodes)
Return a Bitvector that has as many entries as the tree has leaf nodes, and is true where the given l...
Definition: tree/bipartition/functions.cpp:176
genesis::tree::find_monophyletic_subtree_edges
std::vector< size_t > find_monophyletic_subtree_edges(Tree const &tree, std::vector< Bipartition > const &bipartitions, std::vector< TreeNode const * > const &nodes, bool include_splitting_edges, bool include_leaf_edges)
Find clades of the tree that are monophyletic with respect to the given list of nodes,...
Definition: tree/bipartition/functions.cpp:275
genesis::tree::Bipartition
Definition: bipartition.hpp:47
genesis::utils::is_subset
bool is_subset(Bitvector const &sub, Bitvector const &super)
Subset or equal.
Definition: utils/math/bitvector/operators.cpp:107
genesis::tree::CommonNodeData
Common class containing the commonly needed data for tree nodes.
Definition: tree/common_tree/tree.hpp:79
genesis::tree::node_to_leaf_map
std::vector< size_t > node_to_leaf_map(Tree const &tree)
Definition: tree/bipartition/functions.cpp:110
genesis::utils::Bitvector::count
size_t count() const
Count the number of set bits in the Bitvector, that is, its Hamming weight, or population count (popc...
Definition: bitvector.cpp:223
postorder.hpp
functions.hpp
genesis::tree::is_leaf
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
Definition: tree/function/functions.cpp:59
genesis::tree::postorder
utils::Range< IteratorPostorder< true > > postorder(ElementType const &element)
Definition: tree/iterator/postorder.hpp:320
preorder.hpp
genesis::tree::bipartition_set
std::vector< Bipartition > bipartition_set(Tree const &tree)
Definition: tree/bipartition/functions.cpp:58
genesis::tree::Tree::edge_count
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
genesis::tree::get_clade_edges
std::vector< size_t > get_clade_edges(Tree const &tree, std::vector< tree::TreeNode const * > const &nodes)
Definition: tree/bipartition/functions.cpp:371
genesis::utils::Bitvector
Definition: bitvector.hpp:48