A library for working with phylogenetic and population genetic data.
v0.27.0
phylo_factor.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
40 #include "genesis/tree/tree.hpp"
42 
46 
47 #include <cassert>
48 #include <cmath>
49 #include <stdexcept>
50 #include <string>
51 #include <utility>
52 #include <vector>
53 
54 #ifdef GENESIS_OPENMP
55 # include <omp.h>
56 #endif
57 
58 namespace genesis {
59 namespace tree {
60 
61 // =================================================================================================
62 // Phylogenetic Factorization
63 // =================================================================================================
64 
65 std::unordered_set<size_t> phylo_factor_subtree_indices(
66  Subtree const& subtree,
67  std::unordered_set<size_t> const& candidate_edges
68 ) {
69  // Prepare result
70  std::unordered_set<size_t> sub_indices;
71 
72  // Given a link, find the first other link of the same node that is a valid direction
73  // to proceed the traversal. Valid means: it is in the candidate edges, and it is not
74  // a leaf. If no such direction is found, a nullptr is returned.
75  // If leaves are found (links of the node that lead to leaf nodes), we also need
76  // to add them to our result list.
77  // So, basically, this function is a replacement for the next() step in normal tree traversal,
78  // which just might skip some nexts.
79  auto find_valid_next = [&]( TreeLink const* link_ptr ){
80  auto next_ptr = &link_ptr->next();
81  while( next_ptr != link_ptr ) {
82 
83  if( is_leaf( next_ptr->edge() )) {
84 
85  // We do not need to move down to leaves, but store them in our result list.
86  assert( sub_indices.count( next_ptr->edge().index() ) == 0 );
87  sub_indices.insert( next_ptr->edge().index() );
88  } else if( candidate_edges.count( next_ptr->edge().index() ) > 0 ) {
89 
90  // If we found an edge adjacent to the node that is in the candidates list,
91  // we can stop here, as is is what we are looking for.
92  break;
93  }
94 
95  // If we are here, the edge is not in the candidates - either because it has been
96  // removed for a previous factor, or because it is a leaf edge. In both cases, it is
97  // not one we are looking for (no need to go down this edge), so move to the next.
98  assert( candidate_edges.count( next_ptr->edge().index() ) == 0 );
99  next_ptr = &next_ptr->next();
100  }
101 
102  // We treat the special case of "no valid direction found" here, so that the caller
103  // has it easier dealing with that case, and we make it easiert to catch accidents
104  // this way (as they will lead to a segfault when trying to deref the nullptr,
105  // instead of silently succeeding).
106  if( next_ptr == link_ptr ) {
107  next_ptr = nullptr;
108  }
109  return next_ptr;
110  };
111 
112  // Iterate the subtree manually, so that we can easily skip parts of it.
113  // We start at the outer of the given link, because this is what the loop expects:
114  // start from an outer link, and move down in its (yet again) outer direction.
115  // Basically, this loop does a normal outer -> next -> outer -> next ... traversal of the tree,
116  // but might skip some nexts in between.
117  auto link_ptr = &subtree.link().outer();
118  while( link_ptr != &subtree.link() ) {
119 
120  // At the beginning of the loop, we are at the outer link of the node that we want
121  // to consider. Move to the node.
122  // This is the outer() part of normal tree traversal.
123  link_ptr = &link_ptr->outer();
124  assert( candidate_edges.count( link_ptr->edge().index() ) > 0 );
125  assert( ! is_leaf( link_ptr->edge() ));
126 
127  // Unless we are in the first iteration, add it to the result list.
128  // (In the first iteration, we are at the edge of our current candidate factor,
129  // so we do not want to include it.)
130  if( link_ptr != &subtree.link() ) {
131 
132  // Insert the edge. It might already have been inserted, if this iteration is
133  // one that goes up the tree again after finishing with a subtree.
134  sub_indices.insert( link_ptr->edge().index() );
135  }
136 
137  // Find the first subtree of that node that is part of the candidates.
138  // If there is none, we will then move up again.
139  auto next_ptr = find_valid_next( link_ptr );
140 
141  if( next_ptr ) {
142  // We found a direction where to go next. Set it as the new link pointer.
143  // The subtree is in the candidates and not a leaf.
144  // Use the link for the next iteration, so that we go down this subtree.
145  // As we always start at the outer() link of the link where we want to continue,
146  // this is simply the next node itself.
147 
148  assert( candidate_edges.count( next_ptr->edge().index() ) > 0 );
149  assert( ! is_leaf( next_ptr->edge() ));
150  link_ptr = next_ptr;
151 
152  } else {
153  // We did not find a direction where to go next (a subtree that is in the
154  // candidates). Either we move back and up the tree, or we are done.
155 
156  // Check that we are not done with the whole subtree yet. If so (the non-existent else
157  // part of the following condition), no need to do anything, as the while loop will then
158  // terminate in its next check anyway.
159  // (Yes, the following condition could simply be an else if above, but this way,
160  // it is more readable...)
161  if( link_ptr != &subtree.link() ) {
162 
163  // We must have seen the edge before on our way down.
164  assert( sub_indices.count( link_ptr->edge().index() ) > 0 );
165  assert( candidate_edges.count( link_ptr->edge().index() ) > 0 );
166 
167  // Find a valid direction to go up again. We cannot simply use outer().next() here,
168  // as this might be a leaf or not in the candidates, which we need to skip.
169  // It will however definitely find some way to go, which in a bifurcating tree
170  // is the way up - that is, the same way that got us here.
171  link_ptr = find_valid_next( &link_ptr->outer() );
172 
173  // We got here somehow, so there must be a way back.
174  assert( link_ptr );
175  assert( ! is_leaf( link_ptr->edge() ));
176  }
177  }
178  }
179 
180  return sub_indices;
181 }
182 
184  BalanceData const& data,
185  std::unordered_set<size_t> const& candidate_edges,
186  std::function<double( std::vector<double> const& balances )> objective
187 ) {
188  assert( ! data.tree.empty() );
189 
190  // Init a result that has an objective value smaller than all we will encounter.
191  PhyloFactor result;
192  result.objective_value = - std::numeric_limits<double>::infinity();
193  result.all_objective_values = std::vector<double>(
194  data.tree.edge_count(), std::numeric_limits<double>::quiet_NaN()
195  );
196 
197  // We cheat for simplicity, and create a vector of the indices as well,
198  // so that we can efficiently parallelize over it...
199  // (There are ways to use OpenMP on unordered sets, but they are ugly as hell!)
200  auto const cand_vec = std::vector<size_t>( candidate_edges.begin(), candidate_edges.end() );
201 
202  // Try out all candidate edges.
203  #pragma omp parallel for schedule(dynamic)
204  for( size_t i = 0; i < cand_vec.size(); ++i ) {
205  auto const ce_idx = cand_vec[i];
206 
207  assert( ce_idx < data.tree.edge_count() );
208  auto const& edge = data.tree.edge_at( ce_idx );
209 
210  // The calling function already leaves out edges that lead to a leaf.
211  assert( ! is_leaf( edge ));
212 
213  // Find the edges of the two subtrees induced by the split of the edge,
214  // leaving out subtrees that are not connected (that is, which are connected by an
215  // edge that is not in the candidates list).
216  // This might give empty sets, because previous factors can lead to a subtree being
217  // completely blocked.
218  // This could be optimized: an edge that yields an empty set here will also do so
219  // in all following phylo factors, so we can just completely remove it from lookup candidates.
220  // We can however not remove it from the candidates completely, as it is still part of the
221  // edges needed for calculating balances. So, we'd need another set of edges distinct from
222  // the candidates for storing which edges to use for the lookup... too complex for now!
223  auto const p_indices = phylo_factor_subtree_indices(
224  Subtree{ edge.primary_link() }, candidate_edges
225  );
226  if( p_indices.empty() ) {
227  continue;
228  }
229  auto const s_indices = phylo_factor_subtree_indices(
230  Subtree{ edge.secondary_link() }, candidate_edges
231  );
232  if( s_indices.empty() ) {
233  continue;
234  }
235 
236  // We should not have added the actual candidate edge to either of the partitions.
237  assert( s_indices.count( edge.index() ) == 0 );
238  assert( p_indices.count( edge.index() ) == 0 );
239 
240  // Calculate the balances of this edge for all trees.
241  auto const balances = mass_balance( data, s_indices, p_indices );
242 
243  // Calculate the objective function, and store it in the result.
244  auto const ov = objective( balances );
245  result.all_objective_values[ ce_idx ] = ov;
246 
247  // Update our greedy best hit if needed.
248  #pragma omp critical( GENESIS_TREE_MASS_TREE_PHYLO_FACTOR_OBJECTIVE_UPDATE )
249  {
250  if( ov > result.objective_value ) {
251  result.edge_index = ce_idx;
252  result.edge_indices_primary = p_indices;
253  result.edge_indices_secondary = s_indices;
254  result.balances = balances;
255  result.objective_value = ov;
256  }
257  }
258  }
259 
260  return result;
261 }
262 
263 std::vector<PhyloFactor> phylogenetic_factorization(
264  BalanceData const& data,
265  std::function<double( std::vector<double> const& balances )> objective,
266  size_t max_iterations,
267  std::function<void( size_t iteration, size_t max_iterations )> log_progress
268 ) {
269  // Basic checks.
270  if( data.tree.empty() ) {
271  return std::vector<PhyloFactor>{};
272  }
273 
274  // Start with all edges except for leaves as potential candidates for factors.
275  std::unordered_set<size_t> candidate_edges;
276  for( size_t i = 0; i < data.tree.edge_count(); ++i ) {
277  if( ! is_leaf( data.tree.edge_at(i) )) {
278  candidate_edges.insert( i );
279  }
280  }
281 
282  // Special value max iters == 0: get all factors. Also, reduce if too large.
283  if( max_iterations == 0 || max_iterations > candidate_edges.size() ) {
284  max_iterations = candidate_edges.size();
285  }
286 
287  // Successively find factors. This cannot be parallelized,
288  // as each iteration depends on all previous ones.
289  std::vector<PhyloFactor> result;
290  for( size_t it = 0; it < max_iterations; ++it ) {
291  assert( candidate_edges.size() > 0 );
292 
293  // Log the progress, if needed.
294  if( log_progress ) {
295  log_progress( it + 1, max_iterations );
296  }
297 
298  // Find and store the next (greedy) phylo factor.
299  result.push_back( phylo_factor_find_best_edge( data, candidate_edges, objective ));
300 
301  // Remove its edge from the candiate list.
302  assert( candidate_edges.count( result.back().edge_index ) > 0 );
303  candidate_edges.erase( result.back().edge_index );
304  }
305 
306  return result;
307 }
308 
309 } // namespace tree
310 } // namespace genesis
genesis::tree::phylo_factor_subtree_indices
std::unordered_set< size_t > phylo_factor_subtree_indices(Subtree const &subtree, std::unordered_set< size_t > const &candidate_edges)
Helper function for phylogenetic_factorization() to find the constrained subtrees that are split by a...
Definition: phylo_factor.cpp:65
genesis::tree::Subtree::link
TreeLink const & link() const
Get the TreeLink that separates the subtree from the rest of the tree.
Definition: subtree.hpp:125
phylo_factor.hpp
genesis::tree::PhyloFactor::balances
std::vector< double > balances
The balances for all Samples calculated on the two sets of edge indices of this factor.
Definition: phylo_factor.hpp:81
genesis::tree::mass_balance
std::vector< double > mass_balance(BalanceData const &data, std::unordered_set< size_t > const &numerator_edge_indices, std::unordered_set< size_t > const &denominator_edge_indices)
Calculate the balance of edge masses between two sets of edges.
Definition: balances.cpp:357
common.hpp
balances.hpp
tree.hpp
Header of Tree class.
functions.hpp
genesis::tree::PhyloFactor::edge_index
size_t edge_index
The edge that this factor found to be maximizing for the objective function.
Definition: phylo_factor.hpp:66
genesis::tree::Tree::edge_at
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.hpp:238
genesis::tree::PhyloFactor
A single phylogenetic factor.
Definition: phylo_factor.hpp:61
genesis::tree::PhyloFactor::all_objective_values
std::vector< double > all_objective_values
For reference, all other objective values for the other edges of the tree.
Definition: phylo_factor.hpp:98
genesis::tree::BalanceData
Data needed to calculate balances.
Definition: balances.hpp:170
tree.hpp
genesis::tree::PhyloFactor::objective_value
double objective_value
The objective value obtained from the objective function using the balances.
Definition: phylo_factor.hpp:86
subtree.hpp
functions.hpp
genesis::tree::PhyloFactor::edge_indices_secondary
std::unordered_set< size_t > edge_indices_secondary
The set of edges on the non-root (secondary) side of the edge that belongs to this factor.
Definition: phylo_factor.hpp:76
genesis::tree::Tree::empty
bool empty() const
Return whether the Tree is empty (i.e., has no nodes, edges and links).
Definition: tree/tree.hpp:194
genesis::tree::Subtree
Reference to a subtree of a Tree.
Definition: subtree.hpp:69
genesis::tree::phylo_factor_find_best_edge
PhyloFactor phylo_factor_find_best_edge(BalanceData const &data, std::unordered_set< size_t > const &candidate_edges, std::function< double(std::vector< double > const &balances)> objective)
Helper function for phylogenetic_factorization() that tries all candidate edges to find the one that ...
Definition: phylo_factor.cpp:183
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
operators.hpp
Tree operator functions.
genesis::tree::phylogenetic_factorization
std::vector< PhyloFactor > phylogenetic_factorization(BalanceData const &data, std::function< double(std::vector< double > const &balances)> objective, size_t max_iterations, std::function< void(size_t iteration, size_t max_iterations)> log_progress)
Calculate the Phylogenetic Factorization (PhyloFactor) of a set of MassTrees.
Definition: phylo_factor.cpp:263
statistics.hpp
color.hpp
Header of Color class.
functions.hpp
genesis::tree::BalanceData::tree
Tree tree
The Tree on which to calculate balances.
Definition: balances.hpp:179
genesis::tree::is_leaf
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
Definition: tree/function/functions.cpp:59
preorder.hpp
genesis::tree::Tree::edge_count
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
genesis::tree::PhyloFactor::edge_indices_primary
std::unordered_set< size_t > edge_indices_primary
The set of edges on the root (primary) side of the edge that belongs to this factor.
Definition: phylo_factor.hpp:71