A library for working with phylogenetic and population genetic data.
v0.32.0
tree/common_tree/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
41 
42 #include <algorithm>
43 #include <stdexcept>
44 
45 namespace genesis {
46 namespace tree {
47 
48 // =================================================================================================
49 // Node Names
50 // =================================================================================================
51 
52 std::vector<std::string> node_names(
53  Tree const& tree,
54  bool leaves_only
55 ) {
56  std::vector<std::string> names;
57  for( auto const& node : tree.nodes() ) {
58  if( is_inner( node ) && leaves_only ) {
59  continue;
60  }
61  auto const name = node.data<CommonNodeData>().name;
62  if( name.empty() ) {
63  continue;
64  }
65  names.push_back( std::move( name ));
66  }
67  return names;
68 }
69 
70 std::vector<std::string> node_names(
71  std::vector<Tree> const& tree_set,
72  bool leaves_only
73 ) {
74  // It would be faster to directly insert into the resulting container, but this version
75  // avoids code duplication and is fast enough for now.
76  std::vector<std::string> names;
77  for( auto const& tree : tree_set ) {
78  auto tree_name_set = node_names( tree, leaves_only );
79  names.insert( names.end(), tree_name_set.begin(), tree_name_set.end() );
80  }
81  return names;
82 }
83 
85  Tree const& tree,
86  const std::string& name,
87  bool throw_on_failure,
88  bool replace_underscores
89 ) {
90  auto clean_name = name;
91  if (replace_underscores) {
92  clean_name = utils::replace_all(name, "_", " ");
93  }
94 
95  // Try to find the node, return immediately on success.
96  for( auto const& node : tree.nodes() ) {
97  if( node.data<CommonNodeData>().name == clean_name ) {
98  return &node;
99  }
100  }
101 
102  // If we are here, we did not find the not. Fail.
103  if( throw_on_failure ) {
104  throw std::invalid_argument( "Cannot find node '" + name + "' in tree." );
105  }
106  return nullptr;
107 }
108 
110  Tree& tree,
111  const std::string& name,
112  bool throw_on_failure,
113  bool replace_underscores
114 ) {
115  // Avoid code duplication according to Scott Meyers.
116  auto const& ctree = static_cast< Tree const& >( tree );
117  return const_cast< TreeNode* >(
118  find_node( ctree, name, throw_on_failure, replace_underscores )
119  );
120 }
121 
122 std::vector<TreeNode const*> find_nodes(
123  Tree const& tree,
124  std::vector<std::string> const& node_names,
125  bool throw_on_failure,
126  bool replace_underscores
127 ) {
128  // Prepare result
129  std::vector<TreeNode const*> node_list;
130  node_list.reserve( node_names.size() );
131 
132  // Find and return nodes
133  for( auto const& taxon : node_names ) {
134  node_list.push_back( find_node( tree, taxon, throw_on_failure, replace_underscores ));
135  }
136  return node_list;
137 }
138 
139 std::vector<TreeNode*> find_nodes(
140  Tree& tree,
141  std::vector<std::string> const& node_names,
142  bool throw_on_failure,
143  bool replace_underscores
144 ) {
145  // Prepare result
146  std::vector<TreeNode*> node_list;
147  node_list.reserve( node_names.size() );
148 
149  // Find and return nodes
150  for( auto const& taxon : node_names ) {
151  node_list.push_back( find_node( tree, taxon, throw_on_failure, replace_underscores ));
152  }
153  return node_list;
154 }
155 
156 // =================================================================================================
157 // Branch Length
158 // =================================================================================================
159 
160 double length(Tree const& tree)
161 {
162  double len = 0.0;
163  for( auto const& edge : tree.edges() ) {
164  len += edge.data<CommonEdgeData>().branch_length;
165  }
166  return len;
167 }
168 
169 double height(Tree const& tree)
170 {
171  if( tree.empty() ) {
172  return 0.0;
173  }
174  auto dists = node_branch_length_distance_vector(tree);
175  return *std::max_element(dists.begin(), dists.end());
176 }
177 
178 double diameter( Tree const& tree )
179 {
180  if( tree.empty() ) {
181  return 0.0;
182  }
183 
184  // Finding the diameter of a tee (as a graph) can be done via BFS from the root to find one of
185  // the ends of the dimater, and then another BFS from that leaf to the other end of the dimater:
186  // https://cs.stackexchange.com/questions/22855/algorithm-to-find-diameter-of-a-tree-using-bfs-dfs-why-does-it-work
187  // This however goes by number of nodes, instead of branch lengths. Still, I'm fairly certain
188  // that the same logic applies, and so we can use this technique to find our dimater with
189  // just two BFSs and only little memory!
190 
191  // Find the node that is furthest from the root. It has to be a leaf.
192  auto const root_dists = node_branch_length_distance_vector( tree );
193  auto const max_idx_1 = std::distance(
194  root_dists.begin(), std::max_element( root_dists.begin(), root_dists.end() )
195  );
196  assert( root_dists.size() == tree.node_count() );
197  assert( is_leaf( tree.node_at( max_idx_1 ) ));
198 
199  // Now find the node that is furthest from that previously found node,
200  // and return its distance. We do not need to identify its index;
201  // we can just return the max element of the list of distances.
202  auto const far_dists = node_branch_length_distance_vector( tree, &tree.node_at( max_idx_1 ));
203  assert( far_dists.size() == tree.node_count() );
204  return *std::max_element( far_dists.begin(), far_dists.end() );
205 
206  // Old, slow, and very memory intensive for large trees.
207  // auto dist_mat = node_branch_length_distance_matrix( tree );
208  // assert( ! dist_mat.empty() );
209  // assert( dist_mat.rows() == dist_mat.cols() );
210  // return *std::max_element( dist_mat.begin(), dist_mat.end() );
211 }
212 
213 std::vector<double> branch_lengths(
214  Tree const& tree
215 ) {
216  std::vector<double> result;
217  result.reserve( tree.edge_count() );
218  for( size_t i = 0; i < tree.edge_count(); ++i ) {
219  result.push_back( tree.edge_at(i).data<CommonEdgeData>().branch_length );
220  }
221  return result;
222 }
223 
225  Tree& tree,
226  double length
227 ) {
228  for( auto& edge : tree.edges() ) {
229  edge.data<CommonEdgeData>().branch_length = length;
230  }
231 }
232 
234  Tree& tree,
235  double factor
236 ) {
237  for( auto& edge : tree.edges() ) {
238  edge.data<CommonEdgeData>().branch_length *= factor;
239  }
240 }
241 
242 Tree average_branch_length_tree( std::vector<Tree> const& tset )
243 {
244  if( tset.size() == 0 ) {
245  return Tree();
246  }
247 
248  if( ! identical_topology( tset )) {
249  throw std::runtime_error(
250  "Cannot calculate average branch length tree. Trees do not have the same topology."
251  );
252  }
253 
254  // Prepare storage for average branch lengths.
255  size_t num_edges = tset.at(0).edge_count();
256  auto avgs = std::vector<double>(num_edges, 0.0);
257 
258  // We traverse all trees (again, because all_identical_topology() already did this). This is
259  // probably a bit slower than the previous version of this method which worked with less
260  // traversals, but way easier to understand and debug.
261  for( auto& ct : tset ) {
262  // Use an index for position in the preorder traversal. This makes sure that the
263  // index actually always points to the correct edges, indepently of their order in
264  // different trees in the set.
265  size_t idx = 0;
266 
267  // Do a preorder traversal and collect branch lengths.
268  for( auto it : preorder(ct) ) {
269  // The first iteration points to an edge which will be covered later again.
270  // Skip it to prevent double coverage.
271  if (it.is_first_iteration()) {
272  continue;
273  }
274 
275  avgs[idx] += it.edge().data<CommonEdgeData>().branch_length;
276  ++idx;
277  }
278  }
279 
280  // We know that all trees have the same topology. So we take a copy of the first one
281  // (thus, also copying its node names) and modify its branch lengths.
282  auto tree = tset.at(0);
283 
284  // Do the same kind of traversal as before in order to keep the indexing order (preorder) and
285  // set the branch lengths.
286  size_t idx = 0;
287  for( auto it : preorder(tree) ) {
288  // The first iteration points to an edge which will be covered later again.
289  // Skip it to prevent double coverage.
290  if (it.is_first_iteration()) {
291  continue;
292  }
293 
294  it.edge().data<CommonEdgeData>().branch_length = avgs[idx] / tset.size();
295  ++idx;
296  }
297 
298  return tree;
299 }
300 
301 } // namespace tree
302 } // namespace genesis
genesis::tree::Tree::node_count
size_t node_count() const
Return the number of TreeNodes of the Tree.
Definition: tree/tree.hpp:264
functions.hpp
CommonTree functions.
genesis::tree::height
double height(Tree const &tree)
Get the height of the tree, i.e., the longest distance from the root to a leaf, measured using the br...
Definition: tree/common_tree/functions.cpp:169
genesis::utils::replace_all
std::string replace_all(std::string const &text, std::string const &search, std::string const &replace)
Return a copy of a string, where all occurrences of a search string are replaced by a replace string.
Definition: string.cpp:727
genesis::tree::node_branch_length_distance_vector
std::vector< double > node_branch_length_distance_vector(Tree const &tree, TreeNode const *node)
Return a vector containing the distance of all nodes with respect to the given start node,...
Definition: tree/common_tree/distances.cpp:121
genesis::tree::node_names
std::vector< std::string > node_names(Tree const &tree, bool leaves_only)
Returns a list of all TreeNode names of a Tree.
Definition: tree/common_tree/functions.cpp:52
genesis::tree::is_inner
bool is_inner(TreeLink const &link)
Return true iff the node of the given link is an inner node.
Definition: tree/function/functions.cpp:74
genesis::placement::tree_set
tree::TreeSet tree_set(SampleSet const &sample_set)
Return a TreeSet containing all the trees of the SampleSet.
Definition: sample_set.cpp:156
genesis::tree::length
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
Definition: tree/common_tree/functions.cpp:160
tree_set.hpp
functions.hpp
tree_set.hpp
genesis::tree::Tree::node_at
TreeNode & node_at(size_t index)
Return the TreeNode at a certain index.
Definition: tree/tree.hpp:220
genesis::tree::Tree::edge_at
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.hpp:238
genesis::tree::preorder
utils::Range< IteratorPreorder< true > > preorder(ElementType const &element)
Definition: tree/iterator/preorder.hpp:264
genesis::tree::identical_topology
bool identical_topology(Tree const &lhs, Tree const &rhs, bool identical_indices)
Return whether both trees have an identical topology.
Definition: tree/function/operators.cpp:169
string.hpp
Provides some commonly used string utility functions.
genesis::tree::diameter
double diameter(Tree const &tree)
Get the diameter of the tree, i.e., the longest distance between any two nodes, measured using the br...
Definition: tree/common_tree/functions.cpp:178
distances.hpp
Header of CommonTree distance methods.
genesis::tree::Tree
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
genesis::tree::Tree::empty
bool empty() const
Return whether the Tree is empty (i.e., has no nodes, edges and links).
Definition: tree/tree.hpp:194
genesis::tree::CommonEdgeData::branch_length
double branch_length
Branch length of the edge.
Definition: tree/common_tree/tree.hpp:193
genesis::tree::CommonNodeData::name
std::string name
Name of the node.
Definition: tree/common_tree/tree.hpp:127
genesis::tree::TreeNode
Definition: tree/tree/node.hpp:58
genesis::tree::branch_lengths
std::vector< double > branch_lengths(Tree const &tree)
Get a vector of all branch lengths of a Tree, index by the edge index.
Definition: tree/common_tree/functions.cpp:213
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::tree::find_nodes
std::vector< TreeNode const * > find_nodes(Tree const &tree, std::vector< std::string > const &node_names, bool throw_on_failure, bool replace_underscores)
Find TreeNodes in a Tree, given their name.
Definition: tree/common_tree/functions.cpp:122
operators.hpp
Tree operator functions.
genesis::tree::Tree::nodes
utils::Range< IteratorNodes > nodes()
Definition: tree/tree.hpp:418
tree.hpp
genesis::tree::Tree::edges
utils::Range< IteratorEdges > edges()
Definition: tree/tree.hpp:452
genesis::tree::CommonEdgeData
Common class containing the commonly needed data for tree edges.
Definition: tree/common_tree/tree.hpp:144
genesis::tree::average_branch_length_tree
Tree average_branch_length_tree(std::vector< Tree > const &tset)
Return a Tree where the branch lengths are the average of the Trees in the given vector of Trees or T...
Definition: tree/common_tree/functions.cpp:242
genesis::tree::CommonNodeData
Common class containing the commonly needed data for tree nodes.
Definition: tree/common_tree/tree.hpp:79
genesis::tree::TreeEdge::data
EdgeDataType & data()
Definition: edge.hpp:217
genesis::tree::is_leaf
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
Definition: tree/function/functions.cpp:59
genesis::tree::find_node
TreeNode const * find_node(Tree const &tree, const std::string &name, bool throw_on_failure, bool replace_underscores)
Finds a Node, given its name.
Definition: tree/common_tree/functions.cpp:84
genesis::tree::scale_all_branch_lengths
void scale_all_branch_lengths(Tree &tree, double factor)
Scale all branch lengths of a Tree by a given factor.
Definition: tree/common_tree/functions.cpp:233
genesis::tree::set_all_branch_lengths
void set_all_branch_lengths(Tree &tree, double length)
Set all branch lengths of a Tree to a given value.
Definition: tree/common_tree/functions.cpp:224
preorder.hpp
genesis::tree::Tree::edge_count
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272