A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tree/default/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
41 
42 namespace genesis {
43 namespace tree {
44 
45 // =================================================================================================
46 // Node Names
47 // =================================================================================================
48 
49 std::unordered_set<std::string> node_names(
50  Tree const& tree,
51  bool leaves_only
52 ) {
53  std::unordered_set<std::string> name_set;
54  for( auto const& node : tree.nodes() ) {
55  if( is_inner( *node ) && leaves_only ) {
56  continue;
57  }
58  auto const name = node->data<DefaultNodeData>().name;
59  if( name == "" ) {
60  continue;
61  }
62  name_set.insert( std::move( name ));
63  }
64  return name_set;
65 }
66 
68  Tree const& tree,
69  bool leaves_only
70 ) {
72  for( auto const& node : tree.nodes() ) {
73  if( is_inner( *node ) && leaves_only ) {
74  continue;
75  }
76  auto const name = node->data<DefaultNodeData>().name;
77  if( name == "" ) {
78  continue;
79  }
80  name_set.insert( std::move( name ));
81  }
82  return name_set;
83 }
84 
85 std::unordered_set<std::string> node_names(
86  TreeSet const& tree_set,
87  bool leaves_only
88 ) {
89  // It would be faster to directly insert into the resulting container, but this version
90  // avoids code duplication and is fast enough for now.
91  std::unordered_set<std::string> name_set;
92  for( auto const& tree : tree_set ) {
93  auto tree_name_set = node_names( tree.tree, leaves_only );
94  name_set.insert( tree_name_set.begin(), tree_name_set.end() );
95  }
96  return name_set;
97 }
98 
100  TreeSet const& tree_set,
101  bool leaves_only
102 ) {
103  // It would be faster to directly insert into the resulting container, but this version
104  // avoids code duplication and is fast enough for now.
106  for( auto const& tree : tree_set ) {
107  // We can use the unsorted version here, which should be a bit faster (not tested...).
108  // Sorting is then done when inserting the names into the final set.
109  auto tree_name_set = node_names( tree.tree, leaves_only );
110  name_set.insert( tree_name_set.begin(), tree_name_set.end() );
111  }
112  return name_set;
113 }
114 
116  Tree const& tree,
117  const std::string& name,
118  bool replace_underscores
119 ) {
120  auto clean_name = name;
121  if (replace_underscores) {
122  clean_name = utils::replace_all(name, "_", " ");
123  }
124 
125  for (auto it = tree.begin_nodes(); it != tree.end_nodes(); ++it) {
126  if( it->get()->data<DefaultNodeData>().name == clean_name) {
127  return it->get();
128  }
129  }
130 
131  return nullptr;
132 }
133 
135  Tree& tree,
136  const std::string& name,
137  bool replace_underscores
138 ) {
139  // Avoid code duplication according to Scott Meyers.
140  auto const& ctree = static_cast< Tree const& >( tree );
141  return const_cast< TreeNode* >(
142  find_node( ctree, name, replace_underscores )
143  );
144 }
145 
146 // =================================================================================================
147 // Branch Length
148 // =================================================================================================
149 
150 double length(Tree const& tree)
151 {
152  double len = 0.0;
153  for( auto const& edge : tree.edges() ) {
154  len += edge->data<DefaultEdgeData>().branch_length;
155  }
156  return len;
157 }
158 
159 double height(Tree const& tree)
160 {
161  auto dists = node_branch_length_distance_vector(tree);
162  return *std::max_element(dists.begin(), dists.end());
163 }
164 
165 double diameter( Tree const& tree )
166 {
167  auto dist_mat = node_branch_length_distance_matrix( tree );
168  return *std::max_element( dist_mat.begin(), dist_mat.end() );
169 }
170 
171 std::vector<double> branch_lengths(
172  Tree const& tree
173 ) {
174  std::vector<double> result;
175  result.reserve( tree.edge_count() );
176  for( size_t i = 0; i < tree.edge_count(); ++i ) {
177  result.push_back( tree.edge_at(i).data<DefaultEdgeData>().branch_length );
178  }
179  return result;
180 }
181 
183  Tree& tree,
184  double length
185 ) {
186  for( auto& edge : tree.edges() ) {
187  edge->data<DefaultEdgeData>().branch_length = length;
188  }
189 }
190 
192  Tree& tree,
193  double factor
194 ) {
195  for( auto& edge : tree.edges() ) {
196  edge->data<DefaultEdgeData>().branch_length *= factor;
197  }
198 }
199 
201 {
202  using TreeType = Tree;
203 
204  if( tset.size() == 0 ) {
205  return TreeType();
206  }
207 
208  if( ! all_identical_topology( tset )) {
209  throw std::runtime_error( "Trees in TreeSet do not have the same topology." );
210  }
211 
212  // Prepare storage for average branch lengths.
213  size_t num_edges = tset.at(0).tree.edge_count();
214  auto avgs = std::vector<double>(num_edges, 0.0);
215 
216  // We traverse all trees (again, because all_identical_topology() already did this). This is
217  // probably a bit slower than the previous version of this method which worked with less
218  // traversals, but way easier to understand and debug.
219  for( auto& ct : tset ) {
220  // Use an index for position in the preorder traversal. This makes sure that the
221  // index actually always points to the correct edges, indepently of their order in
222  // different trees in the set.
223  size_t idx = 0;
224 
225  // Do a preorder traversal and collect branch lengths.
226  for( auto it : preorder(ct.tree) ) {
227  // The first iteration points to an edge which will be covered later again.
228  // Skip it to prevent double coverage.
229  if (it.is_first_iteration()) {
230  continue;
231  }
232 
233  avgs[idx] += it.edge().data<DefaultEdgeData>().branch_length;
234  ++idx;
235  }
236  }
237 
238  // We know that all trees have the same topology. So we take a copy of the first one
239  // (thus, also copying its node names) and modify its branch lengths.
240  TreeType tree = TreeType( tset.at(0).tree );
241 
242  // Do the same kind of traversal as before in order to keep the indexing order (preorder) and
243  // set the branch lengths.
244  size_t idx = 0;
245  for( auto it : preorder(tree) ) {
246  // The first iteration points to an edge which will be covered later again.
247  // Skip it to prevent double coverage.
248  if (it.is_first_iteration()) {
249  continue;
250  }
251 
252  it.edge().data<DefaultEdgeData>().branch_length = avgs[idx] / tset.size();
253  ++idx;
254  }
255 
256  return tree;
257 }
258 
259 } // namespace tree
260 } // namespace genesis
std::unordered_set< std::string > node_names(Tree const &tree, bool leaves_only)
Returns an unordered set of all TreeNode names of a Tree.
double height(Tree const &tree)
Get the height of the tree, i.e., the longest distance from the root to a leaf, measured using the br...
Tree operator functions.
Default class containing the commonly needed data for tree nodes.
utils::SortedVector< std::string > node_names_sorted(Tree const &tree, bool leaves_only)
Returns a set of all TreeNode names of a Tree.
TreeNode const * find_node(Tree const &tree, const std::string &name, bool replace_underscores)
Finds a Node, given its name. If not found, nullptr is returned.
std::string replace_all(std::string const &text, std::string const &search, std::string const &replace)
Return a copy of a string, where all occurrences of a search string are replaced by a replace string...
Definition: string.cpp:280
utils::Range< IteratorNodes > nodes()
Definition: tree/tree.cpp:484
Sorted vector of unique elements.
utils::Range< IteratorPreorder< TreeLink const, TreeNode const, TreeEdge const > > preorder(ElementType const &element)
IteratorNodes end_nodes()
Definition: tree/tree.cpp:469
Default class containing the commonly needed data for tree edges.
size_t size() const
Return the size of the TreeSet, i.e., the number of stored Trees.
Definition: tree_set.cpp:139
bool is_inner(TreeLink const &link)
Return true iff the node of the given link is an inner node.
utils::Range< IteratorEdges > edges()
Definition: tree/tree.cpp:518
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
utils::Matrix< double > node_branch_length_distance_matrix(Tree const &tree)
Return a distance matrix containing pairwise distances between all Nodes, using the branch_length of ...
IteratorNodes begin_nodes()
Definition: tree/tree.cpp:464
Provides some commonly used string utility functions.
void set_all_branch_lengths(Tree &tree, double length)
Set all branch lengths of a Tree to a given value.
Header of Default Tree distance methods.
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
Default Tree functions.
Tree average_branch_length_tree(TreeSet const &tset)
Return a Tree where the branch lengths are the average of the Trees in the TreeSet, given that they all have the same topology.
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.cpp:324
tree::TreeSet tree_set(SampleSet const &sample_set)
Return a TreeSet containing all the trees of the SampleSet.
Definition: sample_set.cpp:157
std::vector< double > node_branch_length_distance_vector(Tree const &tree, TreeNode const *node)
Return a vector containing the distance of all nodes with respect to the given start node...
void insert(const_reference value)
Insert a value into the container by copying it.
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
std::vector< double > branch_lengths(Tree const &tree)
Get a vector of all branch lengths of a Tree, index by the edge index.
EdgeDataType & data()
Definition: edge.hpp:183
bool all_identical_topology(TreeSet const &tset)
Compare whether all Trees in a TreeSet are equal using their default comparision operators for nodes ...
NamedTree & at(size_t index)
Definition: tree_set.cpp:108
double diameter(Tree const &tree)
Get the diameter of the tree, i.e., the longest distance between any two nodes, measured using the br...
void scale_all_branch_lengths(Tree &tree, double factor)
Scale all branch lengths of a Tree by a given factor.