A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
placement/function/tree.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 #include "genesis/tree/tree.hpp"
39 
40 #include <algorithm>
41 #include <cassert>
42 #include <numeric>
43 #include <stdexcept>
44 #include <utility>
45 #include <vector>
46 
47 namespace genesis {
48 namespace placement {
49 
50 // =================================================================================================
51 // Helper Functions
52 // =================================================================================================
53 
55  Sample const& sample,
56  bool const fully_resolve,
57  std::string const& name_prefix
58 ) {
59  // Get a copy of the original tree that contains only default data.
60  return labelled_tree(
61  sample,
63  fully_resolve,
64  name_prefix
65  );
66 }
67 
69  Sample const& sample,
70  tree::Tree const& tree,
71  bool fully_resolve,
72  std::string const& name_prefix
73 ) {
74  // -----------------------------------------------------
75  // Preparation
76  // -----------------------------------------------------
77 
78  // Get a copy of the original tree that we can add edges to.
79  auto result = tree;
80 
81  // Check whether the tree is compatible with the sample tree.
82  // This is a bit wasteful for cases when this function is called directly from its sample
83  // version above (then, we can be sure that it is compatible), but necessary otherwise.
84  if( ! tree::identical_topology( result, sample.tree() )) {
85  throw std::runtime_error(
86  "Tree provided for labelled_tree() is not compatible "
87  "with the Tree of the provided Sample."
88  );
89  }
90 
91  // Get a list of the edge pointers so that we can iterate and add to the tree at the same time.
92  std::vector< tree::TreeEdge* > edge_list;
93  edge_list.reserve( result.edge_count() );
94  for( auto const& edge : result.edges() ) {
95  edge_list.push_back( edge.get() );
96  }
97  assert( edge_list.size() == sample.tree().edge_count() );
98 
99  // Helper struct that stores a placement along with its containing pquery.
100  struct PlacementPair
101  {
102  PlacementPair( Pquery const* pq, PqueryPlacement const* pp )
103  : pquery( pq )
104  , placement( pp )
105  {}
106 
107  Pquery const* pquery;
108  PqueryPlacement const* placement;
109  };
110 
111  // For each edge, get a list of placements - but only the most probable ones per Pquery.
112  // This is similar to placements_per_edge(), but we also need a pointer to the pquery, in
113  // order to get its name(s).
114  auto placement_pairs_per_edge = []( Sample const& sample ){
115  std::vector< std::vector< PlacementPair >> place_map;
116  place_map.resize( sample.tree().edge_count() );
117  for( auto const& pqry : sample.pqueries() ) {
118 
119  // Find the most probable placement.
120  auto max_it = std::max_element(
121  pqry.placements().begin(),
122  pqry.placements().end(),
123  []( PqueryPlacement const& lhs, PqueryPlacement const& rhs ){
124  return lhs.like_weight_ratio < rhs.like_weight_ratio;
125  }
126  );
127 
128  // If there is one (i.e., the pquery has at least one placement),
129  // add it to the list for its edge.
130  if( max_it != pqry.placements().end() ) {
131  place_map[ max_it->edge().index() ].emplace_back( &pqry, &*max_it );
132  }
133 
134  // Alternative handcrafted version...
135  // PqueryPlacement const* max_p = nullptr;
136  // double max_v = std::numeric_limits<double>::lowest();
137  // for( auto const& place : pqry.placements() ) {
138  // if( place.like_weight_ratio > max_v ) {
139  // max_v = place.like_weight_ratio;
140  // max_p = &place;
141  // }
142  // }
143  // if( max_p ) {
144  // place_map[ max_p->edge().index() ].emplace_back( &pqry, max_p );
145  // }
146  }
147  return place_map;
148  };
149  auto place_map = placement_pairs_per_edge( sample );
150  assert( place_map.size() == edge_list.size() );
151 
152  // Sort the placements per edge, so that we can attach them to the tree in order.
153  // For a fully resolved tree, we want to sort by proximal length, as this is the order in which
154  // the placements will form new nodes on the tree.
155  // For a multifurcating tree, we sort by pendant length, so that the short ones come first.
156  auto sort_placements = fully_resolve
157  ? []( PlacementPair const& lhs, PlacementPair const& rhs ){
158  return lhs.placement->proximal_length < rhs.placement->proximal_length;
159  }
160  : []( PlacementPair const& lhs, PlacementPair const& rhs ){
161  return lhs.placement->pendant_length < rhs.placement->pendant_length;
162  }
163  ;
164  for( auto& edge_l : place_map ) {
165  std::sort( edge_l.begin(), edge_l.end(), sort_placements );
166  }
167 
168  // -----------------------------------------------------
169  // Single Placement on the Edge
170  // -----------------------------------------------------
171 
172  // Helper function that adds one placement, if it is the only one on its edge.
173  auto add_lonely_placement = [ &name_prefix ](
174  tree::Tree& tree,
175  tree::TreeEdge& edge,
176  PlacementPair const& placement_pair
177  ) {
178  // Add the new edges to the tree and get all necessary edges.
179  auto& pendant_edge = tree::add_new_node( tree, edge );
180  auto& proximal_edge = pendant_edge.primary_link().next().edge();
181  auto& distal_edge = pendant_edge.primary_link().next().next().edge();
182 
183  // The primary node is new, so it should be bifurcating and a leaf.
184  assert( pendant_edge.primary_node().rank() == 2 );
185  assert( pendant_edge.secondary_node().is_leaf() );
186 
187  // Some shorthands to the edge data.
188  auto& pend_data = pendant_edge.data<tree::DefaultEdgeData>();
189  auto& prox_data = proximal_edge.data<tree::DefaultEdgeData>();
190  auto& dist_data = distal_edge.data<tree::DefaultEdgeData>();
191 
192  // The pendant and distal edges are new, so they should have default branch lengths.
193  assert( pend_data.branch_length == 0.0 );
194  assert( dist_data.branch_length == 0.0 );
195 
196  // Set all three branch lengths. Make sure they are not negative.
197  double const distal_len = prox_data.branch_length - placement_pair.placement->proximal_length;
198  pend_data.branch_length = placement_pair.placement->pendant_length;
199  prox_data.branch_length = placement_pair.placement->proximal_length;
200  dist_data.branch_length = std::max( 0.0, distal_len );
201 
202  // Set the leaf node name, if there is one. Also, assert that ther is at most one -
203  // otherwise, this is not a lonely placement any more!
204  assert( placement_pair.pquery->name_size() <= 1 );
205  if( placement_pair.pquery->name_size() == 1 ) {
206  auto& pendant_node_data = pendant_edge.secondary_node().data<tree::DefaultNodeData>();
207  pendant_node_data.name = name_prefix + placement_pair.pquery->name_at(0).name;
208  }
209  };
210 
211  // -----------------------------------------------------
212  // Multiple Placements, Fully Resolved
213  // -----------------------------------------------------
214 
215  // Helper function that adds all placements for one edge, fully resolved.
216  auto process_edge_fully_resolved = [ &name_prefix ] (
217  tree::Tree& tree,
218  tree::TreeEdge& edge,
219  std::vector<PlacementPair> const& placement_pairs
220  ) {
221  // In each step, we add a new node, along the original edge, splitting it more and more.
222  // The branch lengths of those fragments are the differences between the proximal lengths
223  // of the placements.
224  // To handle this, store the proximal length that we already used along the branch.
225  double used_length = 0.0;
226 
227  // Also, we need the original branch length, because it will be lost in the process,
228  // but later needed.
229  auto const orig_length = edge.data<tree::DefaultEdgeData>().branch_length;
230 
231  // Store an edge pointer at which we insert the next placement edge.
232  auto insertion_edge = &edge;
233 
234  // Process all placements.
235  for( auto const& placement_pair : placement_pairs ) {
236  // Each name gets its own branch.
237  for( auto const& pquery_name : placement_pair.pquery->names() ) {
238 
239  // Create the new edges.
240  auto& pendant_edge = tree::add_new_node( tree, *insertion_edge );
241  auto& proximal_edge = pendant_edge.primary_link().next().edge();
242  auto& distal_edge = pendant_edge.primary_link().next().next().edge();
243 
244  // The primary node is new, so it should be bifurcating and a leaf.
245  assert( pendant_edge.primary_node().rank() == 2 );
246  assert( pendant_edge.secondary_node().is_leaf() );
247 
248  // Some shorthands to the edge data. We dont need dist data; its branch length
249  // will be set in a later step. Either it becomes the prox edge in the next
250  // iteration, or it is the final edge after the loop.
251  auto& pend_data = pendant_edge.data<tree::DefaultEdgeData>();
252  auto& prox_data = proximal_edge.data<tree::DefaultEdgeData>();
253 
254  // The pendant and distal edges are new, so they should have default branch lengths.
255  assert( pend_data.branch_length == 0.0 );
256  assert( distal_edge.data<tree::DefaultEdgeData>().branch_length == 0.0 );
257 
258  // We sorted the placements by prox len. Thus, this should hold.
259  assert( placement_pair.placement->proximal_length >= used_length );
260 
261  // Set branch properties.
262  pend_data.branch_length = placement_pair.placement->pendant_length;
263  prox_data.branch_length = placement_pair.placement->proximal_length - used_length;
264 
265  // Set the leaf name.
266  auto& node_data = pendant_edge.secondary_node().data<tree::DefaultNodeData>();
267  node_data.name = name_prefix + pquery_name.name;
268 
269  // Set new values for the keeping-track variables, for the next iteration.
270  used_length = placement_pair.placement->proximal_length;
271  insertion_edge = &distal_edge;
272  }
273  }
274 
275  // Now, the insertion edge is the last remaining edge part towards the original
276  // end of the edge. Its branch length is not yet set. Assert this, then set it to what
277  // is left of it.
278  // If the remaining length is however smaller than the original one, that means we have
279  // placements on this edge with a proximal length greater than the original edge length.
280  // This is a bit weird in terms of placement, but might happen due to BLO in the placement.
281  // In this case, we want to avoid negative length, so simply set it to zero.
282  // That means that in total the edge now grew due to its placements. So the tree is not
283  // optimized any more - but for the purposes of this function, this should be okay.
284  auto& end_edge_data = insertion_edge->data<tree::DefaultEdgeData>();
285  assert( end_edge_data.branch_length == 0.0 );
286  if( used_length < orig_length ) {
287  end_edge_data.branch_length = orig_length - used_length;
288  }
289  };
290 
291  // -----------------------------------------------------
292  // Multiple Placements, Multifurcating
293  // -----------------------------------------------------
294 
295  // Helper function that adds all placements for one edge, multifurcating,
296  // if there is more than one placement for the edge.
297  auto process_edge_multifurcating = [ &name_prefix ] (
298  tree::Tree& tree,
299  tree::TreeEdge& edge,
300  std::vector<PlacementPair> const& placement_pairs
301  ) {
302  // Add a new leaf node to the tree, attached to the middle of the given edge (this spltis
303  // the edge in half and adds one more node there). This will be the base to which we then
304  // attach all further nodes, multifurcating.
305  auto& base_edge = tree::add_new_node( tree, edge );
306  auto& pri_edge = base_edge.primary_link().next().edge();
307  auto& sec_edge = base_edge.primary_link().next().next().edge();
308 
309  // The base node is new, so it should be bifurcating and a leaf
310  assert( base_edge.primary_node().rank() == 2 );
311  assert( base_edge.secondary_node().is_leaf() );
312 
313  // Some shorthands to the edge data.
314  auto& base_data = base_edge.data<tree::DefaultEdgeData>();
315  auto& pri_data = pri_edge.data<tree::DefaultEdgeData>();
316  auto& sec_data = sec_edge.data<tree::DefaultEdgeData>();
317 
318  // The pendant and distal edges are new, so they should have default branch lengths.
319  assert( base_data.branch_length == 0.0 );
320  assert( sec_data.branch_length == 0.0 );
321 
322  // Calculate the average attachment position (proximal length) of all placements.
323  auto const avg_prox_len = std::accumulate(
324  placement_pairs.begin(),
325  placement_pairs.end(),
326  0.0,
327  []( double in, PlacementPair const& cur ){
328  return in + cur.placement->proximal_length;
329  }
330  ) / static_cast<double>( placement_pairs.size() );
331 
332  // Set the branch lengths of the two splitted edges (the ones that were one edge before)
333  // so that they each take their share of the original branch length, divided using
334  // the average proximal length of the placements. This way, the multifurcation that contains
335  // the placements will sit at the sweet spot.
336  // Set the second one first, as it uses the former length of the first one (i.e., the
337  // original branch length).
338  // Also, make sure that each new length is non-negative. This might happen if
339  // the placements are somehow weirdly placed.
340  sec_data.branch_length = std::max( 0.0, pri_data.branch_length - avg_prox_len );
341  pri_data.branch_length = std::max( 0.0, avg_prox_len );
342 
343  // This function should only be called with at least one placement. Assert this.
344  // Then, we can safely get the placement with the smallest pendant length.
345  assert( placement_pairs.size() > 0 );
346  auto const min_pen_len = std::max( 0.0, std::min_element(
347  placement_pairs.begin(),
348  placement_pairs.end(),
349  []( PlacementPair const& lhs, PlacementPair const& rhs ){
350  return lhs.placement->pendant_length < rhs.placement->pendant_length;
351  }
352  )->placement->pendant_length );
353 
354  // TODO we sorted them first, so taking the first placement should also work:
355  // (if so, we can drop the above search)
356  assert( min_pen_len == placement_pairs[0].placement->pendant_length );
357 
358  // Use the min bl for the multifurcation base edge.
359  base_data.branch_length = min_pen_len;
360 
361  // Now add all placements as edges to the base node (secondary node of the base edge).
362  for( auto const& placement_pair : placement_pairs ) {
363  // Each name gets its own branch.
364  for( auto const& pquery_name : placement_pair.pquery->names() ) {
365 
366  // Make a new leaf node for this placement.
367  auto& p_edge = tree::add_new_node( tree, base_edge.secondary_node() );
368 
369  // Set the pendant branch length. It is subtracted by
370  // the min pen length, as this is already incorporated into the base's bl.
371  auto& p_data = p_edge.data<tree::DefaultEdgeData>();
372  assert( placement_pair.placement->pendant_length >= min_pen_len );
373  p_data.branch_length = placement_pair.placement->pendant_length - min_pen_len;
374 
375  // Set the leaf node name.
376  auto& p_node_data = p_edge.secondary_node().data<tree::DefaultNodeData>();
377  p_node_data.name = name_prefix + pquery_name.name;
378  }
379  }
380  };
381 
382  // -----------------------------------------------------
383  // Main Loop over Edges
384  // -----------------------------------------------------
385 
386  // Process each edge.
387  for( size_t edge_i = 0; edge_i < edge_list.size(); ++edge_i ) {
388  // Some safety. There needs to be a valid pointer to an edge.
389  assert( edge_list[ edge_i ] );
390 
391  // Nothing to do if there are no placements for this edge.
392  if( place_map[ edge_i ].size() == 0 ) {
393  continue;
394  }
395 
396  // If there is only one placement with at most one name,
397  // both algorithms behave the same, so shortcut this.
398  if( place_map[ edge_i ].size() == 1 && place_map[ edge_i ][0].pquery->name_size() <= 1 ) {
399  assert( place_map[ edge_i ][0].pquery && place_map[ edge_i ][0].placement );
400  add_lonely_placement( result, *edge_list[ edge_i ], place_map[ edge_i ][0] );
401  continue;
402  }
403 
404  // Select which algorithm to use for more than one placement at the edge.
405  if( fully_resolve ) {
406  process_edge_fully_resolved(
407  result, *edge_list[ edge_i ], place_map[ edge_i ]
408  );
409  } else {
410  process_edge_multifurcating(
411  result, *edge_list[ edge_i ], place_map[ edge_i ]
412  );
413  }
414  }
415 
416  return result;
417 }
418 
419 } // namespace placement
420 } // namespace genesis
DefaultTree convert_to_default_tree(Tree const &source_tree)
Convert a Tree to a DefaultTree with DefaultNodeData and DefaultEdgeData.
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:119
TreeEdge & add_new_node(Tree &tree, TreeNode &target_node)
Add a new Node as a leaf to an existing Node.
A pquery holds a set of PqueryPlacements and a set of PqueryNames.
Definition: pquery.hpp:82
Tree operator functions.
Default class containing the commonly needed data for tree nodes.
One placement position of a Pquery on a Tree.
Default class containing the commonly needed data for tree edges.
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:259
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:95
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.cpp:358
double branch_length
Branch length of the edge.
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on...
Definition: sample.hpp:68
std::string name
Name of the node.
Header of Tree class.
tree::Tree labelled_tree(Sample const &sample, bool const fully_resolve, std::string const &name_prefix)
Produce a Tree where the most probable PqueryPlacement of each Pquery in a Sample is turned into an E...
bool identical_topology(Tree const &lhs, Tree const &rhs)
Returns true iff both trees have an identical topology.
double like_weight_ratio
Likelihood weight ratio of this placement.
EdgeDataType & data()
Definition: edge.hpp:118