A toolkit for working with phylogenetic data.
v0.24.0
tree/common_tree/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
41 
42 #include <algorithm>
43 #include <stdexcept>
44 
45 namespace genesis {
46 namespace tree {
47 
48 // =================================================================================================
49 // Node Names
50 // =================================================================================================
51 
52 std::vector<std::string> node_names(
53  Tree const& tree,
54  bool leaves_only
55 ) {
56  std::vector<std::string> names;
57  for( auto const& node : tree.nodes() ) {
58  if( is_inner( node ) && leaves_only ) {
59  continue;
60  }
61  auto const name = node.data<CommonNodeData>().name;
62  if( name.empty() ) {
63  continue;
64  }
65  names.push_back( std::move( name ));
66  }
67  return names;
68 }
69 
70 std::vector<std::string> node_names(
71  std::vector<Tree> const& tree_set,
72  bool leaves_only
73 ) {
74  // It would be faster to directly insert into the resulting container, but this version
75  // avoids code duplication and is fast enough for now.
76  std::vector<std::string> names;
77  for( auto const& tree : tree_set ) {
78  auto tree_name_set = node_names( tree, leaves_only );
79  names.insert( names.end(), tree_name_set.begin(), tree_name_set.end() );
80  }
81  return names;
82 }
83 
85  Tree const& tree,
86  const std::string& name,
87  bool throw_on_failure,
88  bool replace_underscores
89 ) {
90  auto clean_name = name;
91  if (replace_underscores) {
92  clean_name = utils::replace_all(name, "_", " ");
93  }
94 
95  // Try to find the node, return immediately on success.
96  for( auto const& node : tree.nodes() ) {
97  if( node.data<CommonNodeData>().name == clean_name ) {
98  return &node;
99  }
100  }
101 
102  // If we are here, we did not find the not. Fail.
103  if( throw_on_failure ) {
104  throw std::invalid_argument( "Cannot find node '" + name + "' in tree." );
105  }
106  return nullptr;
107 }
108 
110  Tree& tree,
111  const std::string& name,
112  bool throw_on_failure,
113  bool replace_underscores
114 ) {
115  // Avoid code duplication according to Scott Meyers.
116  auto const& ctree = static_cast< Tree const& >( tree );
117  return const_cast< TreeNode* >(
118  find_node( ctree, name, throw_on_failure, replace_underscores )
119  );
120 }
121 
122 std::vector<TreeNode const*> find_nodes(
123  Tree const& tree,
124  std::vector<std::string> const& node_names,
125  bool throw_on_failure,
126  bool replace_underscores
127 ) {
128  // Prepare result
129  std::vector<TreeNode const*> node_list;
130  node_list.reserve( node_names.size() );
131 
132  // Find and return nodes
133  for( auto const& taxon : node_names ) {
134  node_list.push_back( find_node( tree, taxon, throw_on_failure, replace_underscores ));
135  }
136  return node_list;
137 }
138 
139 std::vector<TreeNode*> find_nodes(
140  Tree& tree,
141  std::vector<std::string> const& node_names,
142  bool throw_on_failure,
143  bool replace_underscores
144 ) {
145  // Prepare result
146  std::vector<TreeNode*> node_list;
147  node_list.reserve( node_names.size() );
148 
149  // Find and return nodes
150  for( auto const& taxon : node_names ) {
151  node_list.push_back( find_node( tree, taxon, throw_on_failure, replace_underscores ));
152  }
153  return node_list;
154 }
155 
156 // =================================================================================================
157 // Branch Length
158 // =================================================================================================
159 
160 double length(Tree const& tree)
161 {
162  double len = 0.0;
163  for( auto const& edge : tree.edges() ) {
164  len += edge.data<CommonEdgeData>().branch_length;
165  }
166  return len;
167 }
168 
169 double height(Tree const& tree)
170 {
171  if( tree.empty() ) {
172  return 0.0;
173  }
174  auto dists = node_branch_length_distance_vector(tree);
175  return *std::max_element(dists.begin(), dists.end());
176 }
177 
178 double diameter( Tree const& tree )
179 {
180  if( tree.empty() ) {
181  return 0.0;
182  }
183  auto dist_mat = node_branch_length_distance_matrix( tree );
184  assert( ! dist_mat.empty() );
185  assert( dist_mat.rows() == dist_mat.cols() );
186  return *std::max_element( dist_mat.begin(), dist_mat.end() );
187 }
188 
189 std::vector<double> branch_lengths(
190  Tree const& tree
191 ) {
192  std::vector<double> result;
193  result.reserve( tree.edge_count() );
194  for( size_t i = 0; i < tree.edge_count(); ++i ) {
195  result.push_back( tree.edge_at(i).data<CommonEdgeData>().branch_length );
196  }
197  return result;
198 }
199 
201  Tree& tree,
202  double length
203 ) {
204  for( auto& edge : tree.edges() ) {
205  edge.data<CommonEdgeData>().branch_length = length;
206  }
207 }
208 
210  Tree& tree,
211  double factor
212 ) {
213  for( auto& edge : tree.edges() ) {
214  edge.data<CommonEdgeData>().branch_length *= factor;
215  }
216 }
217 
218 Tree average_branch_length_tree( std::vector<Tree> const& tset )
219 {
220  if( tset.size() == 0 ) {
221  return Tree();
222  }
223 
224  if( ! identical_topology( tset )) {
225  throw std::runtime_error(
226  "Cannot calculate average branch length tree. Trees do not have the same topology."
227  );
228  }
229 
230  // Prepare storage for average branch lengths.
231  size_t num_edges = tset.at(0).edge_count();
232  auto avgs = std::vector<double>(num_edges, 0.0);
233 
234  // We traverse all trees (again, because all_identical_topology() already did this). This is
235  // probably a bit slower than the previous version of this method which worked with less
236  // traversals, but way easier to understand and debug.
237  for( auto& ct : tset ) {
238  // Use an index for position in the preorder traversal. This makes sure that the
239  // index actually always points to the correct edges, indepently of their order in
240  // different trees in the set.
241  size_t idx = 0;
242 
243  // Do a preorder traversal and collect branch lengths.
244  for( auto it : preorder(ct) ) {
245  // The first iteration points to an edge which will be covered later again.
246  // Skip it to prevent double coverage.
247  if (it.is_first_iteration()) {
248  continue;
249  }
250 
251  avgs[idx] += it.edge().data<CommonEdgeData>().branch_length;
252  ++idx;
253  }
254  }
255 
256  // We know that all trees have the same topology. So we take a copy of the first one
257  // (thus, also copying its node names) and modify its branch lengths.
258  auto tree = tset.at(0);
259 
260  // Do the same kind of traversal as before in order to keep the indexing order (preorder) and
261  // set the branch lengths.
262  size_t idx = 0;
263  for( auto it : preorder(tree) ) {
264  // The first iteration points to an edge which will be covered later again.
265  // Skip it to prevent double coverage.
266  if (it.is_first_iteration()) {
267  continue;
268  }
269 
270  it.edge().data<CommonEdgeData>().branch_length = avgs[idx] / tset.size();
271  ++idx;
272  }
273 
274  return tree;
275 }
276 
277 } // namespace tree
278 } // namespace genesis
std::vector< double > branch_lengths(Tree const &tree)
Get a vector of all branch lengths of a Tree, index by the edge index.
TreeNode const * find_node(Tree const &tree, const std::string &name, bool throw_on_failure, bool replace_underscores)
Finds a Node, given its name.
bool is_inner(TreeLink const &link)
Return true iff the node of the given link is an inner node.
Header of CommonTree distance methods.
Tree operator functions.
bool empty() const
Return whether the Tree is empty (i.e., has no nodes, edges and links).
Definition: tree/tree.hpp:194
CommonTree functions.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
Tree average_branch_length_tree(std::vector< Tree > const &tset)
Return a Tree where the branch lengths are the average of the Trees in the given vector of Trees or T...
std::vector< TreeNode const * > find_nodes(Tree const &tree, std::vector< std::string > const &node_names, bool throw_on_failure, bool replace_underscores)
Find TreeNodes in a Tree, given their name.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
tree::TreeSet tree_set(SampleSet const &sample_set)
Return a TreeSet containing all the trees of the SampleSet.
Definition: sample_set.cpp:156
std::string replace_all(std::string const &text, std::string const &search, std::string const &replace)
Return a copy of a string, where all occurrences of a search string are replaced by a replace string...
Definition: string.cpp:322
utils::Range< IteratorNodes > nodes()
Definition: tree/tree.hpp:418
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
std::vector< double > node_branch_length_distance_vector(Tree const &tree, TreeNode const *node)
Return a vector containing the distance of all nodes with respect to the given start node...
double height(Tree const &tree)
Get the height of the tree, i.e., the longest distance from the root to a leaf, measured using the br...
utils::Range< IteratorEdges > edges()
Definition: tree/tree.hpp:452
Provides some commonly used string utility functions.
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.hpp:238
void set_all_branch_lengths(Tree &tree, double length)
Set all branch lengths of a Tree to a given value.
std::string name
Name of the node.
bool identical_topology(Tree const &lhs, Tree const &rhs, bool identical_indices)
Return whether both trees have an identical topology.
utils::Range< IteratorPreorder< true > > preorder(ElementType const &element)
double diameter(Tree const &tree)
Get the diameter of the tree, i.e., the longest distance between any two nodes, measured using the br...
Common class containing the commonly needed data for tree nodes.
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
Common class containing the commonly needed data for tree edges.
EdgeDataType & data()
Definition: edge.hpp:217
utils::Matrix< double > node_branch_length_distance_matrix(Tree const &tree)
Return a distance matrix containing pairwise distances between all Nodes, using the branch_length of ...
std::vector< std::string > node_names(Tree const &tree, bool leaves_only)
Returns a list of all TreeNode names of a Tree.
void scale_all_branch_lengths(Tree &tree, double factor)
Scale all branch lengths of a Tree by a given factor.