A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
placement/function/tree.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
39 #include "genesis/tree/tree.hpp"
40 
41 #include <algorithm>
42 #include <cassert>
43 #include <numeric>
44 #include <stdexcept>
45 #include <utility>
46 #include <vector>
47 
48 namespace genesis {
49 namespace placement {
50 
51 // =================================================================================================
52 // Helper Functions
53 // =================================================================================================
54 
56  Sample const& sample,
57  bool const fully_resolve,
58  std::string const& name_prefix
59 ) {
60  // Get a copy of the original tree that contains only default data.
61  return labelled_tree(
62  sample,
64  fully_resolve,
65  name_prefix
66  );
67 }
68 
70  Sample const& sample,
71  tree::Tree const& tree,
72  bool fully_resolve,
73  std::string const& name_prefix
74 ) {
75  // -----------------------------------------------------
76  // Preparation
77  // -----------------------------------------------------
78 
79  // Get a copy of the original tree that we can add edges to.
80  auto result = tree;
81 
82  // Check whether the tree is compatible with the sample tree.
83  // This is a bit wasteful for cases when this function is called directly from its sample
84  // version above (then, we can be sure that it is compatible), but necessary otherwise.
85  if( ! tree::identical_topology( result, sample.tree() )) {
86  throw std::runtime_error(
87  "Tree provided for labelled_tree() is not compatible "
88  "with the Tree of the provided Sample."
89  );
90  }
91 
92  // Get a list of the edge pointers so that we can iterate and add to the tree at the same time.
93  std::vector< tree::TreeEdge* > edge_list;
94  edge_list.reserve( result.edge_count() );
95  for( auto const& edge : result.edges() ) {
96  edge_list.push_back( edge.get() );
97  }
98  assert( edge_list.size() == sample.tree().edge_count() );
99 
100  // Helper struct that stores a placement along with its containing pquery.
101  struct PlacementPair
102  {
103  PlacementPair( Pquery const* pq, PqueryPlacement const* pp )
104  : pquery( pq )
105  , placement( pp )
106  {}
107 
108  Pquery const* pquery;
109  PqueryPlacement const* placement;
110  };
111 
112  // For each edge, get a list of placements - but only the most probable ones per Pquery.
113  // This is similar to placements_per_edge(), but we also need a pointer to the pquery, in
114  // order to get its name(s).
115  auto placement_pairs_per_edge = []( Sample const& sample ){
116  std::vector< std::vector< PlacementPair >> place_map;
117  place_map.resize( sample.tree().edge_count() );
118  for( auto const& pqry : sample.pqueries() ) {
119 
120  // Find the most probable placement.
121  auto max_it = std::max_element(
122  pqry.placements().begin(),
123  pqry.placements().end(),
124  []( PqueryPlacement const& lhs, PqueryPlacement const& rhs ){
125  return lhs.like_weight_ratio < rhs.like_weight_ratio;
126  }
127  );
128 
129  // If there is one (i.e., the pquery has at least one placement),
130  // add it to the list for its edge.
131  if( max_it != pqry.placements().end() ) {
132  place_map[ max_it->edge().index() ].emplace_back( &pqry, &*max_it );
133  }
134 
135  // Alternative handcrafted version...
136  // PqueryPlacement const* max_p = nullptr;
137  // double max_v = std::numeric_limits<double>::lowest();
138  // for( auto const& place : pqry.placements() ) {
139  // if( place.like_weight_ratio > max_v ) {
140  // max_v = place.like_weight_ratio;
141  // max_p = &place;
142  // }
143  // }
144  // if( max_p ) {
145  // place_map[ max_p->edge().index() ].emplace_back( &pqry, max_p );
146  // }
147  }
148  return place_map;
149  };
150  auto place_map = placement_pairs_per_edge( sample );
151  assert( place_map.size() == edge_list.size() );
152 
153  // Sort the placements per edge, so that we can attach them to the tree in order.
154  // For a fully resolved tree, we want to sort by proximal length, as this is the order in which
155  // the placements will form new nodes on the tree.
156  // For a multifurcating tree, we sort by pendant length, so that the short ones come first.
157  auto sort_placements = fully_resolve
158  ? []( PlacementPair const& lhs, PlacementPair const& rhs ){
159  return lhs.placement->proximal_length < rhs.placement->proximal_length;
160  }
161  : []( PlacementPair const& lhs, PlacementPair const& rhs ){
162  return lhs.placement->pendant_length < rhs.placement->pendant_length;
163  }
164  ;
165  for( auto& edge_l : place_map ) {
166  std::sort( edge_l.begin(), edge_l.end(), sort_placements );
167  }
168 
169  // -----------------------------------------------------
170  // Single Placement on the Edge
171  // -----------------------------------------------------
172 
173  // Helper function that adds one placement, if it is the only one on its edge.
174  auto add_lonely_placement = [ &name_prefix ](
175  tree::Tree& tree,
176  tree::TreeEdge& edge,
177  PlacementPair const& placement_pair
178  ) {
179  // Add the new edges to the tree and get all necessary edges.
180  auto& pendant_edge = tree::add_new_leaf_node( tree, edge );
181  auto& proximal_edge = pendant_edge.primary_link().next().edge();
182  auto& distal_edge = pendant_edge.primary_link().next().next().edge();
183 
184  // The primary node is new, so it should be bifurcating and a leaf.
185  assert( degree( pendant_edge.primary_node() ) == 3 );
186  assert( is_leaf( pendant_edge.secondary_node() ));
187 
188  // Some shorthands to the edge data.
189  auto& pend_data = pendant_edge.data<tree::DefaultEdgeData>();
190  auto& prox_data = proximal_edge.data<tree::DefaultEdgeData>();
191  auto& dist_data = distal_edge.data<tree::DefaultEdgeData>();
192 
193  // The pendant and distal edges are new, so they should have default branch lengths.
194  assert( pend_data.branch_length == 0.0 );
195  assert( dist_data.branch_length == 0.0 );
196 
197  // Set all three branch lengths. Make sure they are not negative.
198  double const distal_len = prox_data.branch_length - placement_pair.placement->proximal_length;
199  pend_data.branch_length = placement_pair.placement->pendant_length;
200  prox_data.branch_length = placement_pair.placement->proximal_length;
201  dist_data.branch_length = std::max( 0.0, distal_len );
202 
203  // Set the leaf node name, if there is one. Also, assert that ther is at most one -
204  // otherwise, this is not a lonely placement any more!
205  assert( placement_pair.pquery->name_size() <= 1 );
206  if( placement_pair.pquery->name_size() == 1 ) {
207  auto& pendant_node_data = pendant_edge.secondary_node().data<tree::DefaultNodeData>();
208  pendant_node_data.name = name_prefix + placement_pair.pquery->name_at(0).name;
209  }
210  };
211 
212  // -----------------------------------------------------
213  // Multiple Placements, Fully Resolved
214  // -----------------------------------------------------
215 
216  // Helper function that adds all placements for one edge, fully resolved.
217  auto process_edge_fully_resolved = [ &name_prefix ] (
218  tree::Tree& tree,
219  tree::TreeEdge& edge,
220  std::vector<PlacementPair> const& placement_pairs
221  ) {
222  // In each step, we add a new node, along the original edge, splitting it more and more.
223  // The branch lengths of those fragments are the differences between the proximal lengths
224  // of the placements.
225  // To handle this, store the proximal length that we already used along the branch.
226  double used_length = 0.0;
227 
228  // Also, we need the original branch length, because it will be lost in the process,
229  // but later needed.
230  auto const orig_length = edge.data<tree::DefaultEdgeData>().branch_length;
231 
232  // Store an edge pointer at which we insert the next placement edge.
233  auto insertion_edge = &edge;
234 
235  // Process all placements.
236  for( auto const& placement_pair : placement_pairs ) {
237  // Each name gets its own branch.
238  for( auto const& pquery_name : placement_pair.pquery->names() ) {
239 
240  // Create the new edges.
241  auto& pendant_edge = tree::add_new_leaf_node( tree, *insertion_edge );
242  auto& proximal_edge = pendant_edge.primary_link().next().edge();
243  auto& distal_edge = pendant_edge.primary_link().next().next().edge();
244 
245  // The primary node is new, so it should be bifurcating and a leaf.
246  assert( degree( pendant_edge.primary_node() ) == 3 );
247  assert( is_leaf( pendant_edge.secondary_node() ));
248 
249  // Some shorthands to the edge data. We dont need dist data; its branch length
250  // will be set in a later step. Either it becomes the prox edge in the next
251  // iteration, or it is the final edge after the loop.
252  auto& pend_data = pendant_edge.data<tree::DefaultEdgeData>();
253  auto& prox_data = proximal_edge.data<tree::DefaultEdgeData>();
254 
255  // The pendant and distal edges are new, so they should have default branch lengths.
256  assert( pend_data.branch_length == 0.0 );
257  assert( distal_edge.data<tree::DefaultEdgeData>().branch_length == 0.0 );
258 
259  // We sorted the placements by prox len. Thus, this should hold.
260  assert( placement_pair.placement->proximal_length >= used_length );
261 
262  // Set branch properties.
263  pend_data.branch_length = placement_pair.placement->pendant_length;
264  prox_data.branch_length = placement_pair.placement->proximal_length - used_length;
265 
266  // Set the leaf name.
267  auto& node_data = pendant_edge.secondary_node().data<tree::DefaultNodeData>();
268  node_data.name = name_prefix + pquery_name.name;
269 
270  // Set new values for the keeping-track variables, for the next iteration.
271  used_length = placement_pair.placement->proximal_length;
272  insertion_edge = &distal_edge;
273  }
274  }
275 
276  // Now, the insertion edge is the last remaining edge part towards the original
277  // end of the edge. Its branch length is not yet set. Assert this, then set it to what
278  // is left of it.
279  // If the remaining length is however smaller than the original one, that means we have
280  // placements on this edge with a proximal length greater than the original edge length.
281  // This is a bit weird in terms of placement, but might happen due to BLO in the placement.
282  // In this case, we want to avoid negative length, so simply set it to zero.
283  // That means that in total the edge now grew due to its placements. So the tree is not
284  // optimized any more - but for the purposes of this function, this should be okay.
285  auto& end_edge_data = insertion_edge->data<tree::DefaultEdgeData>();
286  assert( end_edge_data.branch_length == 0.0 );
287  if( used_length < orig_length ) {
288  end_edge_data.branch_length = orig_length - used_length;
289  }
290  };
291 
292  // -----------------------------------------------------
293  // Multiple Placements, Multifurcating
294  // -----------------------------------------------------
295 
296  // Helper function that adds all placements for one edge, multifurcating,
297  // if there is more than one placement for the edge.
298  auto process_edge_multifurcating = [ &name_prefix ] (
299  tree::Tree& tree,
300  tree::TreeEdge& edge,
301  std::vector<PlacementPair> const& placement_pairs
302  ) {
303  // Add a new leaf node to the tree, attached to the middle of the given edge (this spltis
304  // the edge in half and adds one more node there). This will be the base to which we then
305  // attach all further nodes, multifurcating.
306  auto& base_edge = tree::add_new_leaf_node( tree, edge );
307  auto& pri_edge = base_edge.primary_link().next().edge();
308  auto& sec_edge = base_edge.primary_link().next().next().edge();
309 
310  // The base node is new, so it should be bifurcating and a leaf
311  assert( degree( base_edge.primary_node() ) == 3 );
312  assert( is_leaf( base_edge.secondary_node() ));
313 
314  // Some shorthands to the edge data.
315  auto& base_data = base_edge.data<tree::DefaultEdgeData>();
316  auto& pri_data = pri_edge.data<tree::DefaultEdgeData>();
317  auto& sec_data = sec_edge.data<tree::DefaultEdgeData>();
318 
319  // The pendant and distal edges are new, so they should have default branch lengths.
320  assert( base_data.branch_length == 0.0 );
321  assert( sec_data.branch_length == 0.0 );
322 
323  // Calculate the average attachment position (proximal length) of all placements.
324  auto const avg_prox_len = std::accumulate(
325  placement_pairs.begin(),
326  placement_pairs.end(),
327  0.0,
328  []( double in, PlacementPair const& cur ){
329  return in + cur.placement->proximal_length;
330  }
331  ) / static_cast<double>( placement_pairs.size() );
332 
333  // Set the branch lengths of the two splitted edges (the ones that were one edge before)
334  // so that they each take their share of the original branch length, divided using
335  // the average proximal length of the placements. This way, the multifurcation that contains
336  // the placements will sit at the sweet spot.
337  // Set the second one first, as it uses the former length of the first one (i.e., the
338  // original branch length).
339  // Also, make sure that each new length is non-negative. This might happen if
340  // the placements are somehow weirdly placed.
341  sec_data.branch_length = std::max( 0.0, pri_data.branch_length - avg_prox_len );
342  pri_data.branch_length = std::max( 0.0, avg_prox_len );
343 
344  // This function should only be called with at least one placement. Assert this.
345  // Then, we can safely get the placement with the smallest pendant length.
346  assert( placement_pairs.size() > 0 );
347  auto const min_pen_len = std::max( 0.0, std::min_element(
348  placement_pairs.begin(),
349  placement_pairs.end(),
350  []( PlacementPair const& lhs, PlacementPair const& rhs ){
351  return lhs.placement->pendant_length < rhs.placement->pendant_length;
352  }
353  )->placement->pendant_length );
354 
355  // TODO we sorted them first, so taking the first placement should also work:
356  // (if so, we can drop the above search)
357  assert( min_pen_len == placement_pairs[0].placement->pendant_length );
358 
359  // Use the min bl for the multifurcation base edge.
360  base_data.branch_length = min_pen_len;
361 
362  // Now add all placements as edges to the base node (secondary node of the base edge).
363  for( auto const& placement_pair : placement_pairs ) {
364  // Each name gets its own branch.
365  for( auto const& pquery_name : placement_pair.pquery->names() ) {
366 
367  // Make a new leaf node for this placement.
368  auto& p_edge = tree::add_new_node( tree, base_edge.secondary_node() );
369 
370  // Set the pendant branch length. It is subtracted by
371  // the min pen length, as this is already incorporated into the base's bl.
372  auto& p_data = p_edge.data<tree::DefaultEdgeData>();
373  assert( placement_pair.placement->pendant_length >= min_pen_len );
374  p_data.branch_length = placement_pair.placement->pendant_length - min_pen_len;
375 
376  // Set the leaf node name.
377  auto& p_node_data = p_edge.secondary_node().data<tree::DefaultNodeData>();
378  p_node_data.name = name_prefix + pquery_name.name;
379  }
380  }
381  };
382 
383  // -----------------------------------------------------
384  // Main Loop over Edges
385  // -----------------------------------------------------
386 
387  // Process each edge.
388  for( size_t edge_i = 0; edge_i < edge_list.size(); ++edge_i ) {
389  // Some safety. There needs to be a valid pointer to an edge.
390  assert( edge_list[ edge_i ] );
391 
392  // Nothing to do if there are no placements for this edge.
393  if( place_map[ edge_i ].size() == 0 ) {
394  continue;
395  }
396 
397  // If there is only one placement with at most one name,
398  // both algorithms behave the same, so shortcut this.
399  if( place_map[ edge_i ].size() == 1 && place_map[ edge_i ][0].pquery->name_size() <= 1 ) {
400  assert( place_map[ edge_i ][0].pquery && place_map[ edge_i ][0].placement );
401  add_lonely_placement( result, *edge_list[ edge_i ], place_map[ edge_i ][0] );
402  continue;
403  }
404 
405  // Select which algorithm to use for more than one placement at the edge.
406  if( fully_resolve ) {
407  process_edge_fully_resolved(
408  result, *edge_list[ edge_i ], place_map[ edge_i ]
409  );
410  } else {
411  process_edge_multifurcating(
412  result, *edge_list[ edge_i ], place_map[ edge_i ]
413  );
414  }
415  }
416 
417  return result;
418 }
419 
420 } // namespace placement
421 } // namespace genesis
DefaultTree convert_to_default_tree(Tree const &source_tree)
Convert a Tree to a DefaultTree with DefaultNodeData and DefaultEdgeData.
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
TreeEdge & add_new_node(Tree &tree, TreeNode &target_node)
Add a new Node as a leaf to an existing Node.
size_t degree(TreeNode const &node)
Return the degree of the node, i.e. how many neighbouring nodes it has.
A pquery holds a set of PqueryPlacements and a set of PqueryNames.
Definition: pquery.hpp:82
Tree operator functions.
Default class containing the commonly needed data for tree nodes.
TreeEdge & add_new_leaf_node(Tree &tree, TreeEdge &target_edge)
Add a new Node as a leaf to an existing Edge, by also adding a new Node in the middle of that Edge...
One placement position of a Pquery on a Tree.
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
Default class containing the commonly needed data for tree edges.
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:259
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
double branch_length
Branch length of the edge.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
std::string name
Name of the node.
Header of Tree class.
tree::Tree labelled_tree(Sample const &sample, bool const fully_resolve, std::string const &name_prefix)
Produce a Tree where the most probable PqueryPlacement of each Pquery in a Sample is turned into an E...
bool identical_topology(Tree const &lhs, Tree const &rhs)
Returns true iff both trees have an identical topology.
double like_weight_ratio
Likelihood weight ratio of this placement.
EdgeDataType & data()
Definition: edge.hpp:183