A toolkit for working with phylogenetic data.
v0.24.0
placement/function/tree.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
39 #include "genesis/tree/tree.hpp"
40 
41 #include <algorithm>
42 #include <cassert>
43 #include <numeric>
44 #include <stdexcept>
45 #include <utility>
46 #include <vector>
47 
48 namespace genesis {
49 namespace placement {
50 
51 // =================================================================================================
52 // Helper Functions
53 // =================================================================================================
54 
56  Sample const& sample,
57  bool const fully_resolve,
58  std::string const& name_prefix
59 ) {
60  // Get a copy of the original tree that contains only common data.
61  return labelled_tree(
62  sample,
64  fully_resolve,
65  name_prefix
66  );
67 }
68 
70  Sample const& sample,
71  tree::Tree const& tree,
72  bool fully_resolve,
73  std::string const& name_prefix
74 ) {
75  // -----------------------------------------------------
76  // Preparation
77  // -----------------------------------------------------
78 
79  // Get a copy of the original tree that we can add edges to.
80  auto result = tree;
81 
82  // Check whether the tree is compatible with the sample tree.
83  // This is a bit wasteful for cases when this function is called directly from its sample
84  // version above (then, we can be sure that it is compatible), but necessary otherwise.
85  if( ! tree::identical_topology( result, sample.tree() )) {
86  throw std::runtime_error(
87  "Tree provided for labelled_tree() is not compatible "
88  "with the Tree of the provided Sample."
89  );
90  }
91 
92  // Get a list of the edge pointers so that we can iterate and add to the tree at the same time.
93  std::vector< tree::TreeEdge* > edge_list;
94  edge_list.reserve( result.edge_count() );
95  for( auto& edge : result.edges() ) {
96  edge_list.push_back( &edge );
97  }
98  assert( edge_list.size() == sample.tree().edge_count() );
99 
100  // Helper struct that stores a placement along with its containing pquery.
101  struct PlacementPair
102  {
103  PlacementPair( Pquery const* pq, PqueryPlacement const* pp )
104  : pquery( pq )
105  , placement( pp )
106  {}
107 
108  Pquery const* pquery;
109  PqueryPlacement const* placement;
110  };
111 
112  // For each edge, get a list of placements - but only the most probable ones per Pquery.
113  // This is similar to placements_per_edge(), but we also need a pointer to the pquery, in
114  // order to get its name(s).
115  auto placement_pairs_per_edge = []( Sample const& sample ){
116  std::vector< std::vector< PlacementPair >> place_map;
117  place_map.resize( sample.tree().edge_count() );
118  for( auto const& pqry : sample.pqueries() ) {
119 
120  // Find the most probable placement.
121  auto max_it = std::max_element(
122  pqry.placements().begin(),
123  pqry.placements().end(),
124  []( PqueryPlacement const& lhs, PqueryPlacement const& rhs ){
125  return lhs.like_weight_ratio < rhs.like_weight_ratio;
126  }
127  );
128 
129  // If there is one (i.e., the pquery has at least one placement),
130  // add it to the list for its edge.
131  if( max_it != pqry.placements().end() ) {
132  place_map[ max_it->edge().index() ].emplace_back( &pqry, &*max_it );
133  }
134 
135  // Alternative handcrafted version...
136  // PqueryPlacement const* max_p = nullptr;
137  // double max_v = std::numeric_limits<double>::lowest();
138  // for( auto const& place : pqry.placements() ) {
139  // if( place.like_weight_ratio > max_v ) {
140  // max_v = place.like_weight_ratio;
141  // max_p = &place;
142  // }
143  // }
144  // if( max_p ) {
145  // place_map[ max_p->edge().index() ].emplace_back( &pqry, max_p );
146  // }
147  }
148  return place_map;
149  };
150  auto place_map = placement_pairs_per_edge( sample );
151  assert( place_map.size() == edge_list.size() );
152 
153  // Sort the placements per edge, so that we can attach them to the tree in order.
154  // For a fully resolved tree, we want to sort by proximal length, as this is the order in which
155  // the placements will form new nodes on the tree.
156  // For a multifurcating tree, we sort by pendant length, so that the short ones come first.
157  auto sort_placements = fully_resolve
158  ? []( PlacementPair const& lhs, PlacementPair const& rhs ){
159  return lhs.placement->proximal_length < rhs.placement->proximal_length;
160  }
161  : []( PlacementPair const& lhs, PlacementPair const& rhs ){
162  return lhs.placement->pendant_length < rhs.placement->pendant_length;
163  }
164  ;
165  for( auto& edge_l : place_map ) {
166  std::sort( edge_l.begin(), edge_l.end(), sort_placements );
167  }
168 
169  // -----------------------------------------------------
170  // Single Placement on the Edge
171  // -----------------------------------------------------
172 
173  // Helper function that adds one placement, if it is the only one on its edge.
174  auto add_lonely_placement = [ &name_prefix ](
175  tree::Tree& tree,
176  tree::TreeEdge& edge,
177  PlacementPair const& placement_pair
178  ) {
179  // Add the new edges to the tree and get all necessary edges.
180  auto& pendant_node = tree::add_new_leaf_node( tree, edge );
181  auto& pendant_edge = pendant_node.link().edge();
182  auto& proximal_edge = pendant_edge.primary_link().next().edge();
183  auto& distal_edge = pendant_edge.primary_link().next().next().edge();
184 
185  // The inner node is new, so it should be bifurcating. The other one is a leaf.
186  assert( degree( pendant_edge.primary_node() ) == 3 );
187  assert( is_leaf( pendant_edge.secondary_node() ));
188 
189  // Some shorthands to the edge data.
190  auto& pend_data = pendant_edge.data<tree::CommonEdgeData>();
191  auto& prox_data = proximal_edge.data<tree::CommonEdgeData>();
192  auto& dist_data = distal_edge.data<tree::CommonEdgeData>();
193 
194  // The pendant and distal edges are new, so they should have default branch lengths.
195  assert( pend_data.branch_length == 0.0 );
196  assert( dist_data.branch_length == 0.0 );
197 
198  // Set all three branch lengths. Make sure they are not negative.
199  double const distal_len = prox_data.branch_length - placement_pair.placement->proximal_length;
200  pend_data.branch_length = placement_pair.placement->pendant_length;
201  prox_data.branch_length = placement_pair.placement->proximal_length;
202  dist_data.branch_length = std::max( 0.0, distal_len );
203 
204  // Set the leaf node name, if there is one. Also, assert that ther is at most one -
205  // otherwise, this is not a lonely placement any more!
206  assert( placement_pair.pquery->name_size() <= 1 );
207  if( placement_pair.pquery->name_size() == 1 ) {
208  auto& pendant_node_data = pendant_node.data<tree::CommonNodeData>();
209  pendant_node_data.name = name_prefix + placement_pair.pquery->name_at(0).name;
210  }
211  };
212 
213  // -----------------------------------------------------
214  // Multiple Placements, Fully Resolved
215  // -----------------------------------------------------
216 
217  // Helper function that adds all placements for one edge, fully resolved.
218  auto process_edge_fully_resolved = [ &name_prefix ] (
219  tree::Tree& tree,
220  tree::TreeEdge& edge,
221  std::vector<PlacementPair> const& placement_pairs
222  ) {
223  // In each step, we add a new node, along the original edge, splitting it more and more.
224  // The branch lengths of those fragments are the differences between the proximal lengths
225  // of the placements.
226  // To handle this, store the proximal length that we already used along the branch.
227  double used_length = 0.0;
228 
229  // Also, we need the original branch length, because it will be lost in the process,
230  // but later needed.
231  auto const orig_length = edge.data<tree::CommonEdgeData>().branch_length;
232 
233  // Store an edge pointer at which we insert the next placement edge.
234  auto insertion_edge = &edge;
235 
236  // Process all placements.
237  for( auto const& placement_pair : placement_pairs ) {
238  // Each name gets its own branch.
239  for( auto const& pquery_name : placement_pair.pquery->names() ) {
240 
241  // Create the new edges.
242  auto& pendant_node = tree::add_new_leaf_node( tree, *insertion_edge );
243  auto& pendant_edge = pendant_node.link().edge();
244  auto& proximal_edge = pendant_edge.primary_link().next().edge();
245  auto& distal_edge = pendant_edge.primary_link().next().next().edge();
246 
247  // The primary node is new, so it should be bifurcating and a leaf.
248  assert( degree( pendant_edge.primary_node() ) == 3 );
249  assert( is_leaf( pendant_edge.secondary_node() ));
250 
251  // Some shorthands to the edge data. We dont need dist data; its branch length
252  // will be set in a later step. Either it becomes the prox edge in the next
253  // iteration, or it is the final edge after the loop.
254  auto& pend_data = pendant_edge.data<tree::CommonEdgeData>();
255  auto& prox_data = proximal_edge.data<tree::CommonEdgeData>();
256 
257  // The pendant and distal edges are new, so they should have default branch lengths.
258  assert( pend_data.branch_length == 0.0 );
259  assert( distal_edge.data<tree::CommonEdgeData>().branch_length == 0.0 );
260 
261  // We sorted the placements by prox len. Thus, this should hold.
262  assert( placement_pair.placement->proximal_length >= used_length );
263 
264  // Set branch properties.
265  pend_data.branch_length = placement_pair.placement->pendant_length;
266  prox_data.branch_length = placement_pair.placement->proximal_length - used_length;
267 
268  // Set the leaf name.
269  auto& node_data = pendant_node.data<tree::CommonNodeData>();
270  node_data.name = name_prefix + pquery_name.name;
271 
272  // Set new values for the keeping-track variables, for the next iteration.
273  used_length = placement_pair.placement->proximal_length;
274  insertion_edge = &distal_edge;
275  }
276  }
277 
278  // Now, the insertion edge is the last remaining edge part towards the original
279  // end of the edge. Its branch length is not yet set. Assert this, then set it to what
280  // is left of it.
281  // If the remaining length is however smaller than the original one, that means we have
282  // placements on this edge with a proximal length greater than the original edge length.
283  // This is a bit weird in terms of placement, but might happen due to BLO in the placement.
284  // In this case, we want to avoid negative length, so simply set it to zero.
285  // That means that in total the edge now grew due to its placements. So the tree is not
286  // optimized any more - but for the purposes of this function, this should be okay.
287  auto& end_edge_data = insertion_edge->data<tree::CommonEdgeData>();
288  assert( end_edge_data.branch_length == 0.0 );
289  if( used_length < orig_length ) {
290  end_edge_data.branch_length = orig_length - used_length;
291  }
292  };
293 
294  // -----------------------------------------------------
295  // Multiple Placements, Multifurcating
296  // -----------------------------------------------------
297 
298  // Helper function that adds all placements for one edge, multifurcating,
299  // if there is more than one placement for the edge.
300  auto process_edge_multifurcating = [ &name_prefix ] (
301  tree::Tree& tree,
302  tree::TreeEdge& edge,
303  std::vector<PlacementPair> const& placement_pairs
304  ) {
305  // Add a new leaf node to the tree, attached to the middle of the given edge (this spltis
306  // the edge in half and adds one more node there). This will be the base to which we then
307  // attach all further nodes, multifurcating.
308  auto& new_node = tree::add_new_leaf_node( tree, edge );
309  auto& base_edge = new_node.link().edge();
310  auto& pri_edge = base_edge.primary_link().next().edge();
311  auto& sec_edge = base_edge.primary_link().next().next().edge();
312 
313  // The base node is new, so it should be bifurcating and a leaf
314  assert( degree( base_edge.primary_node() ) == 3 );
315  assert( is_leaf( base_edge.secondary_node() ));
316 
317  // Some shorthands to the edge data.
318  auto& base_data = base_edge.data<tree::CommonEdgeData>();
319  auto& pri_data = pri_edge.data<tree::CommonEdgeData>();
320  auto& sec_data = sec_edge.data<tree::CommonEdgeData>();
321 
322  // The pendant and distal edges are new, so they should have default branch lengths.
323  assert( base_data.branch_length == 0.0 );
324  assert( sec_data.branch_length == 0.0 );
325 
326  // Calculate the average attachment position (proximal length) of all placements.
327  auto const avg_prox_len = std::accumulate(
328  placement_pairs.begin(),
329  placement_pairs.end(),
330  0.0,
331  []( double in, PlacementPair const& cur ){
332  return in + cur.placement->proximal_length;
333  }
334  ) / static_cast<double>( placement_pairs.size() );
335 
336  // Set the branch lengths of the two splitted edges (the ones that were one edge before)
337  // so that they each take their share of the original branch length, divided using
338  // the average proximal length of the placements. This way, the multifurcation that contains
339  // the placements will sit at the sweet spot.
340  // Set the second one first, as it uses the former length of the first one (i.e., the
341  // original branch length).
342  // Also, make sure that each new length is non-negative. This might happen if
343  // the placements are somehow weirdly placed.
344  sec_data.branch_length = std::max( 0.0, pri_data.branch_length - avg_prox_len );
345  pri_data.branch_length = std::max( 0.0, avg_prox_len );
346 
347  // This function should only be called with at least one placement. Assert this.
348  // Then, we can safely get the placement with the smallest pendant length.
349  assert( placement_pairs.size() > 0 );
350  auto const min_pen_len = std::max( 0.0, std::min_element(
351  placement_pairs.begin(),
352  placement_pairs.end(),
353  []( PlacementPair const& lhs, PlacementPair const& rhs ){
354  return lhs.placement->pendant_length < rhs.placement->pendant_length;
355  }
356  )->placement->pendant_length );
357 
358  // TODO we sorted them first, so taking the first placement should also work:
359  // (if so, we can drop the above search)
360  assert( min_pen_len == placement_pairs[0].placement->pendant_length );
361 
362  // Use the min bl for the multifurcation base edge.
363  base_data.branch_length = min_pen_len;
364 
365  // Now add all placements as edges to the base node (secondary node of the base edge).
366  for( auto const& placement_pair : placement_pairs ) {
367  // Each name gets its own branch.
368  for( auto const& pquery_name : placement_pair.pquery->names() ) {
369 
370  // Make a new leaf node for this placement.
371  auto& p_node = tree::add_new_node( tree, new_node );
372  auto& p_edge = p_node.link().edge();
373 
374  // Set the pendant branch length. It is subtracted by
375  // the min pen length, as this is already incorporated into the base's bl.
376  auto& p_data = p_edge.data<tree::CommonEdgeData>();
377  assert( placement_pair.placement->pendant_length >= min_pen_len );
378  p_data.branch_length = placement_pair.placement->pendant_length - min_pen_len;
379 
380  // Set the leaf node name.
381  auto& p_node_data = p_node.data<tree::CommonNodeData>();
382  p_node_data.name = name_prefix + pquery_name.name;
383  }
384  }
385  };
386 
387  // -----------------------------------------------------
388  // Main Loop over Edges
389  // -----------------------------------------------------
390 
391  // Process each edge.
392  for( size_t edge_i = 0; edge_i < edge_list.size(); ++edge_i ) {
393  // Some safety. There needs to be a valid pointer to an edge.
394  assert( edge_list[ edge_i ] );
395 
396  // Nothing to do if there are no placements for this edge.
397  if( place_map[ edge_i ].size() == 0 ) {
398  continue;
399  }
400 
401  // If there is only one placement with at most one name,
402  // both algorithms behave the same, so shortcut this.
403  if( place_map[ edge_i ].size() == 1 && place_map[ edge_i ][0].pquery->name_size() <= 1 ) {
404  assert( place_map[ edge_i ][0].pquery && place_map[ edge_i ][0].placement );
405  add_lonely_placement( result, *edge_list[ edge_i ], place_map[ edge_i ][0] );
406  continue;
407  }
408 
409  // Select which algorithm to use for more than one placement at the edge.
410  if( fully_resolve ) {
411  process_edge_fully_resolved(
412  result, *edge_list[ edge_i ], place_map[ edge_i ]
413  );
414  } else {
415  process_edge_multifurcating(
416  result, *edge_list[ edge_i ], place_map[ edge_i ]
417  );
418  }
419  }
420 
421  return result;
422 }
423 
424 } // namespace placement
425 } // namespace genesis
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
A pquery holds a set of PqueryPlacements and a set of PqueryNames.
Definition: pquery.hpp:82
Tree operator functions.
tree::Tree labelled_tree(Sample const &sample, bool const fully_resolve, std::string const &name_prefix)
Produce a Tree where the most probable PqueryPlacement of each Pquery in a Sample is turned into an E...
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
One placement position of a Pquery on a Tree.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:259
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
CommonTree convert_to_common_tree(Tree const &source_tree)
Convert a Tree to a CommonTree with CommonNodeData and CommonEdgeData.
TreeNode & add_new_leaf_node(Tree &tree, TreeEdge &target_edge, std::function< void(TreeEdge &target_edge, TreeEdge &new_edge)> adjust_edges)
Add a new Node as a leaf to an existing Edge, by also adding a new Node in the middle of that Edge...
std::string name
Name of the node.
TreeNode & add_new_node(Tree &tree, TreeNode &target_node)
Add a new Node as a leaf to an existing Node.
double branch_length
Branch length of the edge.
bool identical_topology(Tree const &lhs, Tree const &rhs, bool identical_indices)
Return whether both trees have an identical topology.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
Common class containing the commonly needed data for tree nodes.
Header of Tree class.
Common class containing the commonly needed data for tree edges.
size_t degree(TreeLink const &link)
Return the degree of the node for a given TreeLink, i.e. how many neighbouring nodes it has...
double like_weight_ratio
Likelihood weight ratio of this placement.
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.