A library for working with phylogenetic and population genetic data.
v0.32.0
placement/function/tree.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
39 #include "genesis/tree/tree.hpp"
40 
41 #include <algorithm>
42 #include <cassert>
43 #include <numeric>
44 #include <stdexcept>
45 #include <utility>
46 #include <vector>
47 
48 namespace genesis {
49 namespace placement {
50 
51 // =================================================================================================
52 // Local Helper Functions
53 // =================================================================================================
54 
55 // Helper struct that stores a placement along with its containing pquery.
56 struct LabelledTreePlacementPair
57 {
58  LabelledTreePlacementPair( Pquery const* pq, PqueryPlacement const* pp )
59  : pquery( pq )
60  , placement( pp )
61  {}
62 
63  Pquery const* pquery;
64  PqueryPlacement const* placement;
65 };
66 
67 // -----------------------------------------------------
68 // Placement Pairs per Edge
69 // -----------------------------------------------------
70 
71 // For each edge, get a list of placements - but only the most probable ones per Pquery.
72 // This is similar to placements_per_edge(), but we also need a pointer to the pquery, in
73 // order to get its name(s).
74 std::vector< std::vector< LabelledTreePlacementPair >> labelled_tree_placement_pairs_per_edge_(
75  Sample const& sample
76 ) {
77  std::vector< std::vector< LabelledTreePlacementPair >> place_map;
78  place_map.resize( sample.tree().edge_count() );
79  for( auto const& pqry : sample.pqueries() ) {
80 
81  // Find the most probable placement.
82  auto max_it = std::max_element(
83  pqry.placements().begin(),
84  pqry.placements().end(),
85  []( PqueryPlacement const& lhs, PqueryPlacement const& rhs ){
86  return lhs.like_weight_ratio < rhs.like_weight_ratio;
87  }
88  );
89 
90  // If there is one (i.e., the pquery has at least one placement),
91  // add it to the list for its edge.
92  if( max_it != pqry.placements().end() ) {
93  place_map[ max_it->edge().index() ].emplace_back( &pqry, &*max_it );
94  }
95 
96  // Alternative handcrafted version...
97  // PqueryPlacement const* max_p = nullptr;
98  // double max_v = std::numeric_limits<double>::lowest();
99  // for( auto const& place : pqry.placements() ) {
100  // if( place.like_weight_ratio > max_v ) {
101  // max_v = place.like_weight_ratio;
102  // max_p = &place;
103  // }
104  // }
105  // if( max_p ) {
106  // place_map[ max_p->edge().index() ].emplace_back( &pqry, max_p );
107  // }
108  }
109  return place_map;
110 }
111 
112 // -----------------------------------------------------
113 // Single Placement on the Edge
114 // -----------------------------------------------------
115 
116 // Helper function that adds one placement, if it is the only one on its edge.
118  tree::Tree& tree,
119  tree::TreeEdge& edge,
120  LabelledTreePlacementPair const& placement_pair,
121  std::string const& name_prefix
122 ) {
123  // Add the new edges to the tree and get all necessary edges.
124  auto& pendant_node = tree::add_new_leaf_node( tree, edge );
125  auto& pendant_edge = pendant_node.link().edge();
126  auto& proximal_edge = pendant_edge.primary_link().next().edge();
127  auto& distal_edge = pendant_edge.primary_link().next().next().edge();
128 
129  // The inner node is new, so it should be bifurcating. The other one is a leaf.
130  assert( degree( pendant_edge.primary_node() ) == 3 );
131  assert( is_leaf( pendant_edge.secondary_node() ));
132 
133  // Some shorthands to the edge data.
134  auto& pend_data = pendant_edge.data<tree::CommonEdgeData>();
135  auto& prox_data = proximal_edge.data<tree::CommonEdgeData>();
136  auto& dist_data = distal_edge.data<tree::CommonEdgeData>();
137 
138  // The pendant and distal edges are new, so they should have default branch lengths.
139  assert( pend_data.branch_length == 0.0 );
140  assert( dist_data.branch_length == 0.0 );
141 
142  // Set all three branch lengths. Make sure they are not negative.
143  double const distal_len = prox_data.branch_length - placement_pair.placement->proximal_length;
144  pend_data.branch_length = placement_pair.placement->pendant_length;
145  prox_data.branch_length = placement_pair.placement->proximal_length;
146  dist_data.branch_length = std::max( 0.0, distal_len );
147 
148  // Set the leaf node name, if there is one. Also, assert that ther is at most one -
149  // otherwise, this is not a lonely placement any more!
150  assert( placement_pair.pquery->name_size() <= 1 );
151  if( placement_pair.pquery->name_size() == 1 ) {
152  auto& pendant_node_data = pendant_node.data<tree::CommonNodeData>();
153  pendant_node_data.name = name_prefix + placement_pair.pquery->name_at(0).name;
154  }
155 }
156 
157 // -----------------------------------------------------
158 // Multiple Placements, Fully Resolved
159 // -----------------------------------------------------
160 
161 // Helper function that adds all placements for one edge, fully resolved.
163  tree::Tree& tree,
164  tree::TreeEdge& edge,
165  std::vector<LabelledTreePlacementPair> const& placement_pairs,
166  std::string const& name_prefix
167 ) {
168  // In each step, we add a new node, along the original edge, splitting it more and more.
169  // The branch lengths of those fragments are the differences between the proximal lengths
170  // of the placements.
171  // To handle this, store the proximal length that we already used along the branch.
172  double used_length = 0.0;
173 
174  // Also, we need the original branch length, because it will be lost in the process,
175  // but later needed.
176  auto const orig_length = edge.data<tree::CommonEdgeData>().branch_length;
177 
178  // Store an edge pointer at which we insert the next placement edge.
179  auto insertion_edge = &edge;
180 
181  // Process all placements.
182  for( auto const& placement_pair : placement_pairs ) {
183  // Each name gets its own branch.
184  for( auto const& pquery_name : placement_pair.pquery->names() ) {
185 
186  // Create the new edges.
187  auto& pendant_node = tree::add_new_leaf_node( tree, *insertion_edge );
188  auto& pendant_edge = pendant_node.link().edge();
189  auto& proximal_edge = pendant_edge.primary_link().next().edge();
190  auto& distal_edge = pendant_edge.primary_link().next().next().edge();
191 
192  // The primary node is new, so it should be bifurcating and a leaf.
193  assert( degree( pendant_edge.primary_node() ) == 3 );
194  assert( is_leaf( pendant_edge.secondary_node() ));
195 
196  // Some shorthands to the edge data. We dont need dist data; its branch length
197  // will be set in a later step. Either it becomes the prox edge in the next
198  // iteration, or it is the final edge after the loop.
199  auto& pend_data = pendant_edge.data<tree::CommonEdgeData>();
200  auto& prox_data = proximal_edge.data<tree::CommonEdgeData>();
201 
202  // The pendant and distal edges are new, so they should have default branch lengths.
203  assert( pend_data.branch_length == 0.0 );
204  assert( distal_edge.data<tree::CommonEdgeData>().branch_length == 0.0 );
205 
206  // We sorted the placements by prox len. Thus, this should hold.
207  assert( placement_pair.placement->proximal_length >= used_length );
208 
209  // Set branch properties.
210  pend_data.branch_length = placement_pair.placement->pendant_length;
211  prox_data.branch_length = placement_pair.placement->proximal_length - used_length;
212 
213  // Set the leaf name.
214  auto& node_data = pendant_node.data<tree::CommonNodeData>();
215  node_data.name = name_prefix + pquery_name.name;
216 
217  // Set new values for the keeping-track variables, for the next iteration.
218  used_length = placement_pair.placement->proximal_length;
219  insertion_edge = &distal_edge;
220  }
221  }
222 
223  // Now, the insertion edge is the last remaining edge part towards the original
224  // end of the edge. Its branch length is not yet set. Assert this, then set it to what
225  // is left of it.
226  // If the remaining length is however smaller than the original one, that means we have
227  // placements on this edge with a proximal length greater than the original edge length.
228  // This is a bit weird in terms of placement, but might happen due to BLO in the placement.
229  // In this case, we want to avoid negative length, so simply set it to zero.
230  // That means that in total the edge now grew due to its placements. So the tree is not
231  // optimized any more - but for the purposes of this function, this should be okay.
232  auto& end_edge_data = insertion_edge->data<tree::CommonEdgeData>();
233  assert( end_edge_data.branch_length == 0.0 );
234  if( used_length < orig_length ) {
235  end_edge_data.branch_length = orig_length - used_length;
236  }
237 }
238 
239 // -----------------------------------------------------
240 // Multiple Placements, Multifurcating
241 // -----------------------------------------------------
242 
243 // Helper function that adds all placements for one edge, multifurcating,
244 // if there is more than one placement for the edge.
246  tree::Tree& tree,
247  tree::TreeEdge& edge,
248  std::vector<LabelledTreePlacementPair> const& placement_pairs,
249  std::string const& name_prefix
250 ) {
251  // Add a new leaf node to the tree, attached to the middle of the given edge (this spltis
252  // the edge in half and adds one more node there). This will be the base to which we then
253  // attach all further nodes, multifurcating.
254  auto& new_node = tree::add_new_leaf_node( tree, edge );
255  auto& base_edge = new_node.link().edge();
256  auto& pri_edge = base_edge.primary_link().next().edge();
257  auto& sec_edge = base_edge.primary_link().next().next().edge();
258 
259  // The base node is new, so it should be bifurcating and a leaf
260  assert( degree( base_edge.primary_node() ) == 3 );
261  assert( is_leaf( base_edge.secondary_node() ));
262 
263  // Some shorthands to the edge data.
264  auto& base_data = base_edge.data<tree::CommonEdgeData>();
265  auto& pri_data = pri_edge.data<tree::CommonEdgeData>();
266  auto& sec_data = sec_edge.data<tree::CommonEdgeData>();
267 
268  // The pendant and distal edges are new, so they should have default branch lengths.
269  assert( base_data.branch_length == 0.0 );
270  assert( sec_data.branch_length == 0.0 );
271 
272  // Calculate the average attachment position (proximal length) of all placements.
273  auto const avg_prox_len = std::accumulate(
274  placement_pairs.begin(),
275  placement_pairs.end(),
276  0.0,
277  []( double in, LabelledTreePlacementPair const& cur ){
278  return in + cur.placement->proximal_length;
279  }
280  ) / static_cast<double>( placement_pairs.size() );
281 
282  // Set the branch lengths of the two splitted edges (the ones that were one edge before)
283  // so that they each take their share of the original branch length, divided using
284  // the average proximal length of the placements. This way, the multifurcation that contains
285  // the placements will sit at the sweet spot.
286  // Set the second one first, as it uses the former length of the first one (i.e., the
287  // original branch length).
288  // Also, make sure that each new length is non-negative. This might happen if
289  // the placements are somehow weirdly placed.
290  sec_data.branch_length = std::max( 0.0, pri_data.branch_length - avg_prox_len );
291  pri_data.branch_length = std::max( 0.0, avg_prox_len );
292 
293  // This function should only be called with at least one placement. Assert this.
294  // Then, we can safely get the placement with the smallest pendant length.
295  assert( placement_pairs.size() > 0 );
296  auto const min_pen_len = std::max( 0.0, std::min_element(
297  placement_pairs.begin(),
298  placement_pairs.end(),
299  []( LabelledTreePlacementPair const& lhs, LabelledTreePlacementPair const& rhs ){
300  return lhs.placement->pendant_length < rhs.placement->pendant_length;
301  }
302  )->placement->pendant_length );
303 
304  // TODO we sorted them first, so taking the first placement should also work:
305  // (if so, we can drop the above search)
306  assert( min_pen_len == placement_pairs[0].placement->pendant_length );
307 
308  // Use the min bl for the multifurcation base edge.
309  base_data.branch_length = min_pen_len;
310 
311  // Now add all placements as edges to the base node (secondary node of the base edge).
312  for( auto const& placement_pair : placement_pairs ) {
313  // Each name gets its own branch.
314  for( auto const& pquery_name : placement_pair.pquery->names() ) {
315 
316  // Make a new leaf node for this placement.
317  auto& p_node = tree::add_new_node( tree, new_node );
318  auto& p_edge = p_node.link().edge();
319 
320  // Set the pendant branch length. It is subtracted by
321  // the min pen length, as this is already incorporated into the base's bl.
322  auto& p_data = p_edge.data<tree::CommonEdgeData>();
323  assert( placement_pair.placement->pendant_length >= min_pen_len );
324  p_data.branch_length = placement_pair.placement->pendant_length - min_pen_len;
325 
326  // Set the leaf node name.
327  auto& p_node_data = p_node.data<tree::CommonNodeData>();
328  p_node_data.name = name_prefix + pquery_name.name;
329  }
330  }
331 }
332 
333 // =================================================================================================
334 // Helper Functions
335 // =================================================================================================
336 
338  Sample const& sample,
339  bool const fully_resolve,
340  std::string const& name_prefix
341 ) {
342  // Get a copy of the original tree that contains only common data.
343  return labelled_tree(
344  sample,
346  fully_resolve,
347  name_prefix
348  );
349 }
350 
352  Sample const& sample,
353  tree::Tree const& tree,
354  bool fully_resolve,
355  std::string const& name_prefix
356 ) {
357  // -----------------------------------------------------
358  // Preparation
359  // -----------------------------------------------------
360 
361  // Get a copy of the original tree that we can add edges to.
362  auto result = tree;
363 
364  // Check whether the tree is compatible with the sample tree.
365  // This is a bit wasteful for cases when this function is called directly from its sample
366  // version above (then, we can be sure that it is compatible), but necessary otherwise.
367  if( ! tree::identical_topology( result, sample.tree() )) {
368  throw std::runtime_error(
369  "Tree provided for labelled_tree() is not compatible "
370  "with the Tree of the provided Sample."
371  );
372  }
373 
374  // Get a list of the edge pointers so that we can iterate and add to the tree at the same time.
375  std::vector< tree::TreeEdge* > edge_list;
376  edge_list.reserve( result.edge_count() );
377  for( auto& edge : result.edges() ) {
378  edge_list.push_back( &edge );
379  }
380  assert( edge_list.size() == sample.tree().edge_count() );
381 
382  auto place_map = labelled_tree_placement_pairs_per_edge_( sample );
383  assert( place_map.size() == edge_list.size() );
384 
385  // Sort the placements per edge, so that we can attach them to the tree in order.
386  // For a fully resolved tree, we want to sort by proximal length, as this is the order in which
387  // the placements will form new nodes on the tree.
388  // For a multifurcating tree, we sort by pendant length, so that the short ones come first.
389  auto sort_placements = fully_resolve
390  ? []( LabelledTreePlacementPair const& lhs, LabelledTreePlacementPair const& rhs ){
391  return lhs.placement->proximal_length < rhs.placement->proximal_length;
392  }
393  : []( LabelledTreePlacementPair const& lhs, LabelledTreePlacementPair const& rhs ){
394  return lhs.placement->pendant_length < rhs.placement->pendant_length;
395  }
396  ;
397  for( auto& edge_l : place_map ) {
398  std::sort( edge_l.begin(), edge_l.end(), sort_placements );
399  }
400 
401  // -----------------------------------------------------
402  // Main Loop over Edges
403  // -----------------------------------------------------
404 
405  // Process each edge.
406  for( size_t edge_i = 0; edge_i < edge_list.size(); ++edge_i ) {
407  // Some safety. There needs to be a valid pointer to an edge.
408  assert( edge_list[ edge_i ] );
409 
410  // Nothing to do if there are no placements for this edge.
411  if( place_map[ edge_i ].size() == 0 ) {
412  continue;
413  }
414 
415  // If there is only one placement with at most one name,
416  // both algorithms behave the same, so shortcut this.
417  if( place_map[ edge_i ].size() == 1 && place_map[ edge_i ][0].pquery->name_size() <= 1 ) {
418  assert( place_map[ edge_i ][0].pquery && place_map[ edge_i ][0].placement );
420  result, *edge_list[ edge_i ], place_map[ edge_i ][0], name_prefix
421  );
422  continue;
423  }
424 
425  // Select which algorithm to use for more than one placement at the edge.
426  if( fully_resolve ) {
428  result, *edge_list[ edge_i ], place_map[ edge_i ], name_prefix
429  );
430  } else {
432  result, *edge_list[ edge_i ], place_map[ edge_i ], name_prefix
433  );
434  }
435  }
436 
437  return result;
438 }
439 
440 } // namespace placement
441 } // namespace genesis
genesis::placement::Sample::tree
PlacementTree & tree()
Get the PlacementTree of this Sample.
Definition: sample.cpp:124
genesis::placement::labelled_tree_process_edge_multifurcating_
void labelled_tree_process_edge_multifurcating_(tree::Tree &tree, tree::TreeEdge &edge, std::vector< LabelledTreePlacementPair > const &placement_pairs, std::string const &name_prefix)
Definition: placement/function/tree.cpp:245
genesis::placement::Sample
Manage a set of Pqueries along with the PlacementTree where the PqueryPlacements are placed on.
Definition: sample.hpp:68
tree.hpp
Header of Tree class.
genesis::placement::labelled_tree_add_lonely_placement_
void labelled_tree_add_lonely_placement_(tree::Tree &tree, tree::TreeEdge &edge, LabelledTreePlacementPair const &placement_pair, std::string const &name_prefix)
Definition: placement/function/tree.cpp:117
functions.hpp
genesis::tree::add_new_node
TreeNode & add_new_node(Tree &tree, TreeNode &target_node)
Add a new Node as a leaf to an existing Node.
Definition: tree/function/manipulation.cpp:109
genesis::tree::convert_to_common_tree
CommonTree convert_to_common_tree(Tree const &source_tree)
Convert a Tree to a CommonTree with CommonNodeData and CommonEdgeData.
Definition: tree/common_tree/operators.cpp:95
genesis::tree::add_new_leaf_node
TreeNode & add_new_leaf_node(Tree &tree, TreeEdge &target_edge, std::function< void(TreeEdge &target_edge, TreeEdge &new_edge)> adjust_edges)
Add a new Node as a leaf to an existing Edge, by also adding a new Node in the middle of that Edge.
Definition: tree/function/manipulation.cpp:263
genesis::tree::degree
size_t degree(TreeLink const &link)
Return the degree of the node for a given TreeLink, i.e. how many neighbouring nodes it has.
Definition: tree/function/functions.cpp:103
genesis::tree::identical_topology
bool identical_topology(Tree const &lhs, Tree const &rhs, bool identical_indices)
Return whether both trees have an identical topology.
Definition: tree/function/operators.cpp:169
genesis::tree::Tree
Class for representing phylogenetic trees.
Definition: tree/tree.hpp:97
operators.hpp
genesis::tree::CommonEdgeData::branch_length
double branch_length
Branch length of the edge.
Definition: tree/common_tree/tree.hpp:193
genesis::placement::labelled_tree_placement_pairs_per_edge_
std::vector< std::vector< LabelledTreePlacementPair > > labelled_tree_placement_pairs_per_edge_(Sample const &sample)
Definition: placement/function/tree.cpp:74
genesis::tree::CommonNodeData::name
std::string name
Name of the node.
Definition: tree/common_tree/tree.hpp:127
genesis::tree::TreeEdge
Definition: edge.hpp:60
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
manipulation.hpp
genesis::placement::labelled_tree_process_edge_fully_resolved_
void labelled_tree_process_edge_fully_resolved_(tree::Tree &tree, tree::TreeEdge &edge, std::vector< LabelledTreePlacementPair > const &placement_pairs, std::string const &name_prefix)
Definition: placement/function/tree.cpp:162
operators.hpp
Tree operator functions.
tree.hpp
genesis::tree::CommonEdgeData
Common class containing the commonly needed data for tree edges.
Definition: tree/common_tree/tree.hpp:144
genesis::placement::PqueryPlacement
One placement position of a Pquery on a Tree.
Definition: placement/pquery/placement.hpp:75
tree.hpp
genesis::tree::CommonNodeData
Common class containing the commonly needed data for tree nodes.
Definition: tree/common_tree/tree.hpp:79
genesis::tree::TreeEdge::data
EdgeDataType & data()
Definition: edge.hpp:217
genesis::tree::is_leaf
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
Definition: tree/function/functions.cpp:59
genesis::placement::Sample::pqueries
utils::Range< iterator_pqueries > pqueries()
Return a Range iterator to the Pqueries .
Definition: sample.cpp:264
genesis::placement::labelled_tree
tree::Tree labelled_tree(Sample const &sample, bool const fully_resolve, std::string const &name_prefix)
Produce a Tree where the most probable PqueryPlacement of each Pquery in a Sample is turned into an E...
Definition: placement/function/tree.cpp:337
helper.hpp
genesis::tree::Tree::edge_count
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272