A library for working with phylogenetic and population genetic data.
v0.32.0
phylo_factor.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
40 #include "genesis/tree/tree.hpp"
42 
46 
47 #include <cassert>
48 #include <cmath>
49 #include <stdexcept>
50 #include <string>
51 #include <utility>
52 #include <vector>
53 
54 #ifdef GENESIS_OPENMP
55 # include <omp.h>
56 #endif
57 
58 namespace genesis {
59 namespace tree {
60 
61 // =================================================================================================
62 // Phylogenetic Factorization
63 // =================================================================================================
64 
65 std::unordered_set<size_t> phylo_factor_subtree_indices(
66  Subtree const& subtree,
67  std::unordered_set<size_t> const& candidate_edges
68 ) {
69  // Prepare result
70  std::unordered_set<size_t> sub_indices;
71 
72  // Given a link, find the first other link of the same node that is a valid direction
73  // to proceed the traversal. Valid means: it is in the candidate edges, and it is not
74  // a leaf. If no such direction is found, a nullptr is returned.
75  // If leaves are found (links of the node that lead to leaf nodes), we also need
76  // to add them to our result list.
77  // So, basically, this function is a replacement for the next() step in normal tree traversal,
78  // which just might skip some nexts.
79  auto find_valid_next = [&]( TreeLink const* link_ptr ){
80  auto next_ptr = &link_ptr->next();
81  while( next_ptr != link_ptr ) {
82 
83  if( is_leaf( next_ptr->edge() )) {
84 
85  // We do not need to move down to leaves, but store them in our result list.
86  assert( sub_indices.count( next_ptr->edge().index() ) == 0 );
87  sub_indices.insert( next_ptr->edge().index() );
88  } else if( candidate_edges.count( next_ptr->edge().index() ) > 0 ) {
89 
90  // If we found an edge adjacent to the node that is in the candidates list,
91  // we can stop here, as is is what we are looking for.
92  break;
93  }
94 
95  // If we are here, the edge is not in the candidates - either because it has been
96  // removed for a previous factor, or because it is a leaf edge. In both cases, it is
97  // not one we are looking for (no need to go down this edge), so move to the next.
98  assert( candidate_edges.count( next_ptr->edge().index() ) == 0 );
99  next_ptr = &next_ptr->next();
100  }
101 
102  // We treat the special case of "no valid direction found" here, so that the caller
103  // has it easier dealing with that case, and we make it easiert to catch accidents
104  // this way (as they will lead to a segfault when trying to deref the nullptr,
105  // instead of silently succeeding).
106  if( next_ptr == link_ptr ) {
107  next_ptr = nullptr;
108  }
109  return next_ptr;
110  };
111 
112  // Iterate the subtree manually, so that we can easily skip parts of it.
113  // We start at the outer of the given link, because this is what the loop expects:
114  // start from an outer link, and move down in its (yet again) outer direction.
115  // Basically, this loop does a normal outer -> next -> outer -> next ... traversal of the tree,
116  // but might skip some nexts in between.
117  auto link_ptr = &subtree.link().outer();
118  while( link_ptr != &subtree.link() ) {
119 
120  // At the beginning of the loop, we are at the outer link of the node that we want
121  // to consider. Move to the node.
122  // This is the outer() part of normal tree traversal.
123  link_ptr = &link_ptr->outer();
124  assert( candidate_edges.count( link_ptr->edge().index() ) > 0 );
125  assert( ! is_leaf( link_ptr->edge() ));
126 
127  // Unless we are in the first iteration, add it to the result list.
128  // (In the first iteration, we are at the edge of our current candidate factor,
129  // so we do not want to include it.)
130  if( link_ptr != &subtree.link() ) {
131 
132  // Insert the edge. It might already have been inserted, if this iteration is
133  // one that goes up the tree again after finishing with a subtree.
134  sub_indices.insert( link_ptr->edge().index() );
135  }
136 
137  // Find the first subtree of that node that is part of the candidates.
138  // If there is none, we will then move up again.
139  auto next_ptr = find_valid_next( link_ptr );
140 
141  if( next_ptr ) {
142  // We found a direction where to go next. Set it as the new link pointer.
143  // The subtree is in the candidates and not a leaf.
144  // Use the link for the next iteration, so that we go down this subtree.
145  // As we always start at the outer() link of the link where we want to continue,
146  // this is simply the next node itself.
147 
148  assert( candidate_edges.count( next_ptr->edge().index() ) > 0 );
149  assert( ! is_leaf( next_ptr->edge() ));
150  link_ptr = next_ptr;
151 
152  } else {
153  // We did not find a direction where to go next (a subtree that is in the
154  // candidates). Either we move back and up the tree, or we are done.
155 
156  // Check that we are not done with the whole subtree yet. If so (the non-existent else
157  // part of the following condition), no need to do anything, as the while loop will then
158  // terminate in its next check anyway.
159  // (Yes, the following condition could simply be an else if above, but this way,
160  // it is more readable...)
161  if( link_ptr != &subtree.link() ) {
162 
163  // We must have seen the edge before on our way down.
164  assert( sub_indices.count( link_ptr->edge().index() ) > 0 );
165  assert( candidate_edges.count( link_ptr->edge().index() ) > 0 );
166 
167  // Find a valid direction to go up again. We cannot simply use outer().next() here,
168  // as this might be a leaf or not in the candidates, which we need to skip.
169  // It will however definitely find some way to go, which in a bifurcating tree
170  // is the way up - that is, the same way that got us here.
171  link_ptr = find_valid_next( &link_ptr->outer() );
172 
173  // We got here somehow, so there must be a way back.
174  assert( link_ptr );
175  assert( ! is_leaf( link_ptr->edge() ));
176  }
177  }
178  }
179 
180  return sub_indices;
181 }
182 
184  size_t iteration,
185  BalanceData const& data,
186  std::unordered_set<size_t> const& candidate_edges,
188 ) {
189  assert( ! data.tree.empty() );
190 
191  // Init a result that has an objective value smaller than all we will encounter.
192  PhyloFactor result;
193  result.objective_value = - std::numeric_limits<double>::infinity();
194  result.all_objective_values = std::vector<double>(
195  data.tree.edge_count(), std::numeric_limits<double>::quiet_NaN()
196  );
197 
198  // We cheat for simplicity, and create a vector of the indices as well,
199  // so that we can efficiently parallelize over it...
200  // (There are ways to use OpenMP on unordered sets, but they are ugly as hell!)
201  auto const cand_vec = std::vector<size_t>( candidate_edges.begin(), candidate_edges.end() );
202 
203  // Try out all candidate edges.
204  #pragma omp parallel for schedule(dynamic)
205  for( size_t i = 0; i < cand_vec.size(); ++i ) {
206  auto const ce_idx = cand_vec[i];
207 
208  assert( ce_idx < data.tree.edge_count() );
209  auto const& edge = data.tree.edge_at( ce_idx );
210  assert( edge.index() == ce_idx );
211 
212  // The calling function already leaves out edges that lead to a leaf.
213  assert( ! is_leaf( edge ));
214 
215  // Find the edges of the two subtrees induced by the split of the edge,
216  // leaving out subtrees that are not connected (that is, which are connected by an
217  // edge that is not in the candidates list).
218  // This might give empty sets, because previous factors can lead to a subtree being
219  // completely blocked.
220  // This could be optimized: an edge that yields an empty set here will also do so
221  // in all following phylo factors, so we can just completely remove it from lookup candidates.
222  // We can however not remove it from the candidates completely, as it is still part of the
223  // edges needed for calculating balances. So, we'd need another set of edges distinct from
224  // the candidates for storing which edges to use for the lookup... too complex for now!
225  auto const p_indices = phylo_factor_subtree_indices(
226  Subtree{ edge.primary_link() }, candidate_edges
227  );
228  if( p_indices.empty() ) {
229  continue;
230  }
231  auto const s_indices = phylo_factor_subtree_indices(
232  Subtree{ edge.secondary_link() }, candidate_edges
233  );
234  if( s_indices.empty() ) {
235  continue;
236  }
237 
238  // We should not have added the actual candidate edge to either of the partitions.
239  assert( s_indices.count( edge.index() ) == 0 );
240  assert( p_indices.count( edge.index() ) == 0 );
241 
242  // Calculate the balances of this edge for all trees.
243  auto const balances = mass_balance( data, s_indices, p_indices );
244 
245  // Calculate the objective function, and store it in the result.
246  auto const ov = objective( iteration, ce_idx, balances );
247  result.all_objective_values[ ce_idx ] = ov;
248  if( ! std::isfinite( ov )) {
249  continue;
250  }
251 
252  // Update our greedy best hit if needed.
253  #pragma omp critical( GENESIS_TREE_MASS_TREE_PHYLO_FACTOR_OBJECTIVE_UPDATE )
254  {
255  if( ov > result.objective_value ) {
256  result.edge_index = ce_idx;
257  result.edge_indices_primary = p_indices;
258  result.edge_indices_secondary = s_indices;
259  result.balances = balances;
260  result.objective_value = ov;
261  }
262  }
263  }
264 
265  return result;
266 }
267 
268 std::vector<PhyloFactor> phylogenetic_factorization(
269  BalanceData const& data,
270  std::function<double( std::vector<double> const& balances )> objective,
271  size_t max_iterations,
272  std::function<void( size_t iteration, size_t max_iterations )> log_progress
273 ) {
275  data,
276  [&objective]( size_t, size_t, std::vector<double> const& balances ){
277  return objective( balances );
278  },
279  max_iterations,
280  log_progress
281  );
282 }
283 
284 std::vector<PhyloFactor> phylogenetic_factorization(
285  BalanceData const& data,
287  size_t max_iterations,
288  std::function<void( size_t iteration, size_t max_iterations )> log_progress
289 ) {
290  // Basic checks.
291  if( data.tree.empty() ) {
292  return std::vector<PhyloFactor>{};
293  }
294 
295  // Start with all edges except for leaves as potential candidates for factors.
296  std::unordered_set<size_t> candidate_edges;
297  for( size_t i = 0; i < data.tree.edge_count(); ++i ) {
298  if( ! is_leaf( data.tree.edge_at(i) )) {
299  candidate_edges.insert( i );
300  }
301  }
302 
303  // Special value max iters == 0: get all factors. Also, reduce if too large.
304  if( max_iterations == 0 || max_iterations > candidate_edges.size() ) {
305  max_iterations = candidate_edges.size();
306  }
307 
308  // Successively find factors. This cannot be parallelized,
309  // as each iteration depends on all previous ones.
310  std::vector<PhyloFactor> result;
311  for( size_t it = 0; it < max_iterations; ++it ) {
312  assert( candidate_edges.size() > 0 );
313 
314  // Log the progress, if needed.
315  if( log_progress ) {
316  log_progress( it, max_iterations );
317  }
318 
319  // Find and store the next (greedy) phylo factor.
320  result.push_back( phylo_factor_find_best_edge( it, data, candidate_edges, objective ));
321 
322  // Remove its edge from the candiate list.
323  assert( candidate_edges.count( result.back().edge_index ) > 0 );
324  candidate_edges.erase( result.back().edge_index );
325  }
326 
327  return result;
328 }
329 
330 } // namespace tree
331 } // namespace genesis
genesis::tree::phylo_factor_subtree_indices
std::unordered_set< size_t > phylo_factor_subtree_indices(Subtree const &subtree, std::unordered_set< size_t > const &candidate_edges)
Helper function for phylogenetic_factorization() to find the constrained subtrees that are split by a...
Definition: phylo_factor.cpp:65
genesis::tree::Subtree::link
TreeLink const & link() const
Get the TreeLink that separates the subtree from the rest of the tree.
Definition: subtree.hpp:125
phylo_factor.hpp
genesis::tree::PhyloFactor::balances
std::vector< double > balances
The balances for all Samples calculated on the two sets of edge indices of this factor.
Definition: phylo_factor.hpp:88
genesis::tree::mass_balance
std::vector< double > mass_balance(BalanceData const &data, std::unordered_set< size_t > const &numerator_edge_indices, std::unordered_set< size_t > const &denominator_edge_indices)
Calculate the balance of edge masses between two sets of edges.
Definition: balances.cpp:357
common.hpp
balances.hpp
tree.hpp
Header of Tree class.
functions.hpp
genesis::tree::PhyloFactor::edge_index
size_t edge_index
The edge that this factor found to be maximizing for the objective function.
Definition: phylo_factor.hpp:73
genesis::tree::Tree::edge_at
TreeEdge & edge_at(size_t index)
Return the TreeEdge at a certain index.
Definition: tree/tree.hpp:238
genesis::tree::PhyloFactor
A single phylogenetic factor.
Definition: phylo_factor.hpp:68
genesis::tree::PhyloFactor::all_objective_values
std::vector< double > all_objective_values
For reference, all other objective values for the other edges of the tree.
Definition: phylo_factor.hpp:105
genesis::tree::BalanceData
Data needed to calculate balances.
Definition: balances.hpp:170
tree.hpp
genesis::tree::PhyloFactor::objective_value
double objective_value
The objective value obtained from the objective function using the balances.
Definition: phylo_factor.hpp:93
subtree.hpp
functions.hpp
genesis::tree::PhyloFactorObjectiveFunction
std::function< double(size_t iteration, size_t edge_index, std::vector< double > const &balances) > PhyloFactorObjectiveFunction
Function type used as the objective function in phylogenetic_factorization().
Definition: phylo_factor.hpp:61
genesis::tree::PhyloFactor::edge_indices_secondary
std::unordered_set< size_t > edge_indices_secondary
The set of edges on the non-root (secondary) side of the edge that belongs to this factor.
Definition: phylo_factor.hpp:83
genesis::tree::Tree::empty
bool empty() const
Return whether the Tree is empty (i.e., has no nodes, edges and links).
Definition: tree/tree.hpp:194
genesis::tree::Subtree
Reference to a subtree of a Tree.
Definition: subtree.hpp:69
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
operators.hpp
Tree operator functions.
genesis::tree::phylogenetic_factorization
std::vector< PhyloFactor > phylogenetic_factorization(BalanceData const &data, std::function< double(std::vector< double > const &balances)> objective, size_t max_iterations, std::function< void(size_t iteration, size_t max_iterations)> log_progress)
Calculate the Phylogenetic Factorization (PhyloFactor) of a set of MassTrees.
Definition: phylo_factor.cpp:268
statistics.hpp
color.hpp
Header of Color class.
functions.hpp
genesis::tree::BalanceData::tree
Tree tree
The Tree on which to calculate balances.
Definition: balances.hpp:179
genesis::tree::phylo_factor_find_best_edge
PhyloFactor phylo_factor_find_best_edge(size_t iteration, BalanceData const &data, std::unordered_set< size_t > const &candidate_edges, PhyloFactorObjectiveFunction objective)
Helper function for phylogenetic_factorization() that tries all candidate edges to find the one that ...
Definition: phylo_factor.cpp:183
genesis::tree::is_leaf
bool is_leaf(TreeLink const &link)
Return true iff the node of the given link is a leaf node.
Definition: tree/function/functions.cpp:59
preorder.hpp
genesis::tree::Tree::edge_count
size_t edge_count() const
Return the number of TreeEdges of the Tree.
Definition: tree/tree.hpp:272
genesis::tree::PhyloFactor::edge_indices_primary
std::unordered_set< size_t > edge_indices_primary
The set of edges on the root (primary) side of the edge that belongs to this factor.
Definition: phylo_factor.hpp:78