A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
taxonomy/functions/entropy.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
39 
43 
44 #include <assert.h>
45 #include <algorithm>
46 #include <functional>
47 #include <limits>
48 #include <numeric>
49 #include <map>
50 #include <stdexcept>
51 
52 namespace genesis {
53 namespace taxonomy {
54 
55 // =================================================================================================
56 // Prune
57 // =================================================================================================
58 
60  Taxonomy& taxonomy,
61  size_t target_taxonomy_size,
62  PruneByEntropySettings settings
63 ) {
64 
65  // -------------------------------------------------------------------------
66  // Init
67  // -------------------------------------------------------------------------
68 
69  // Basic check.
70  if( taxa_count_lowest_levels( taxonomy ) < target_taxonomy_size ) {
71  LOG_INFO << "Taxonomy only has " + std::to_string( taxa_count_lowest_levels( taxonomy ) )
72  << " leaf Taxa. Pruning with " + std::to_string( target_taxonomy_size )
73  << " leaves thus includes the whole taxonomy.";
74 
75  // Expand fully.
76  for( auto it : preorder( taxonomy ) ) {
77  auto& status = it.taxon().data<EntropyTaxonData>().status;
78  if( it.taxon().size() == 0 ) {
80  } else {
82  }
83  }
84 
85  return;
86 
87  // throw std::runtime_error(
88  // "Taxonomy only has " + std::to_string( taxa_count_lowest_levels( taxonomy ) ) +
89  // " leaf Taxa. Cannot prune it with " + std::to_string( target_taxonomy_size ) + " leaves."
90  // );
91  }
92 
93  // Init all taxa to be outside of the pruned taxonomy.
94  preorder_for_each( taxonomy, [] ( Taxon& t ) {
96  });
97 
98  // Current taxa of the border of pruning, sorted by their entropy.
99  // We will do a preorder traversal of the Taxonomy, in order to decide which taxa make up the
100  // border of the pruning. We do not go deeper into taxa that should end up being the border.
101  // We use this list to decide which taxa to expand: Always go deeper (i.e., set status to inner)
102  // into the taxon with the highest entropy.
103  std::multimap< double, Taxon* > border_candidates;
104 
105  // Keep track of how many border taxa we have. This value should approach the target size.
106  size_t border_taxa_count = 0;
107 
108  // -------------------------------------------------------------------------
109  // Helper Functions
110  // -------------------------------------------------------------------------
111 
112  // Helper function that adds a taxon to the border and sets it as a border candidate,
113  // unless it has too few leaves. In this case, it and its children will become inner and
114  // the leaves border taxa. That is, this taxon will be fully contained and not pruned.
115  std::function< void ( Taxon& ) > add_taxon_to_border = [&] ( Taxon& taxon ) {
116 
118  LOG_WARN << "Taxon added to border not outside ("
120  << "): " << TaxopathGenerator()( taxon );
121  }
122 
123  // If the taxon has fewer leaves than the threshold (but is not a leaf itself),
124  // we make the whole sub-taxonomy a part of the pruned taxonomy.
125  if(
126  taxon.size() > 0 &&
127  settings.min_subtaxonomy_size > 0 &&
129  ) {
130 
131  // First, the taxon itself is inside.
133 
134  // Then, iterate it, and set all taxa in it to inner or
135  // border, depending on whether they are inner of leaf taxa.
136  for( auto it : preorder( taxon ) ) {
137  auto& sub_taxon_data = it.taxon().data<EntropyTaxonData>();
138  if( it.taxon().size() == 0 ) {
140  ++border_taxa_count;
141  } else {
142  sub_taxon_data.status = EntropyTaxonData::PruneStatus::kInside;
143  }
144  }
145 
146  } else if( taxon.size() == 1 ) {
147 
148  // If a Taxon has only one child, there is no need in adding this Taxon as a border
149  // candidate.
150  // We can instead directly add its child. This will not increase the resulting
151  // number of border taxa, as we still add only one. This mainly avoids to stop too
152  // early, which would result in branches of the taxonomy that only contain one
153  // child anyway, so it would make little sense to prune there.
154 
156  add_taxon_to_border( taxon.at(0) );
157 
158  } else {
159 
160  // If the taxon has more leaves than the min subtax threshold, make it a border taxon.
162  ++border_taxa_count;
163 
164  // Also, if it has children, add the taxon as a new candidate for expanding.
165  // Thus, in the pruning algorithm, it can be used to go deeper into the taxonomy.
166  if( taxon.size() > 0 ) {
167  border_candidates.emplace( taxon.data<EntropyTaxonData>().entropy, &taxon );
168  }
169  }
170  };
171 
172  // Helper function to expand a taxon, make it inner and its children new border candidates.
173  std::function< void ( Taxon& ) > expand_taxon = [&] (
174  Taxon& taxon
175  ) {
176  auto& status = taxon.data<EntropyTaxonData>().status;
177 
179  LOG_WARN << "Expanding Taxon with status "
181  << ": " << TaxopathGenerator()( taxon );
182  }
183 
184  // The taxon is expanded, thus it is inner now.
186  --border_taxa_count;
187 
188  // All its children then form the new border.
189  for( auto& child : taxon ) {
190  add_taxon_to_border( child );
191  }
192  };
193 
194  // -------------------------------------------------------------------------
195  // Min Border Level
196  // -------------------------------------------------------------------------
197 
198  // If we want to have a certain minimum level of the taxonomy fully inside the final taxonomy,
199  // do this before we start with the actual pruning algorithm.
200  if( settings.min_border_level > 0 ) {
201 
202  // Helper function that sets a taxon to inside (or border, if it is a leaf) if it has a
203  // too low level. Thus, low level taxa are fully expaneded.
204  auto include_min_level_taxa = [&] ( Taxon& taxon ) {
205 
206  // If the taxon has a lower level than the min...
207  if( taxon_level( taxon ) < settings.min_border_level ) {
208 
209  // Make it inside, if it has childern, or a border, if it is a leaf.
210  if( taxon.size() > 0 ) {
212  } else {
214  ++border_taxa_count;
215  }
216 
217  // If it is just at the level, make it a border, and make it a candidate for
218  // further expansion.
219  } else if( taxon_level( taxon ) == settings.min_border_level ) {
220  add_taxon_to_border( taxon );
221  }
222  };
223 
224  levelorder_for_each( taxonomy, include_min_level_taxa );
225  }
226 
227  // LOG_DBG << "after min border level. num cands " << border_candidates.size();
228  // LOG_DBG1 << "valid " << ( validate_pruned_taxonomy( taxonomy ) ? "1" : "nooooooooooooooo" );
229  // for( auto const& elem : border_candidates ) {
230  // LOG_DBG1 << "H\t" << elem.first << "\tname\t" << TaxopathGenerator()( *elem.second ) << "\tlevel\t"
231  // << taxon_level( *elem.second ) << "\tchildren\t" << taxa_count_lowest_levels( *elem.second );
232  // }
233 
234  // -------------------------------------------------------------------------
235  // Max Subtaxonomy Size
236  // -------------------------------------------------------------------------
237 
238  // If we want to avoid taxa that are too big, do this in a preprocessing step and only
239  // after this start the actual entropy pruning phase.
240  if( settings.max_subtaxonomy_size > 0 ) {
241 
242  // Helper function that recursively visits taxa and makes them inside taxa if they are
243  // too big. Small enough taxa will end up being the border.
244  std::function< void ( Taxon& ) > resolve_big_subtaxa = [&] (
245  Taxon& taxon
246  ) {
247  auto& status = taxon.data<EntropyTaxonData>().status;
248 
249  if(
251  || taxon.size() == 1
253  ) {
254 
255  // If the taxon has too many leaves, make it an inside taxon and recurse this
256  // function on its children (until they are all small enough).
257  // Also, if it has exactly one child, do this - for the resons explained in
258  // add_taxon_to_border().
259  // Furthermore, if it already is an inside taxon (from the min border level init),
260  // we want to keep it this way, so we simply recurse.
261 
262  // If we turn a border taxon into inner, we need to descrese the counter, and
263  // if it was an entry in the border candidates, remove it.
265 
266  auto cand = std::find_if(
267  border_candidates.begin(),
268  border_candidates.end(),
269  [ &taxon ] ( std::pair< const double, Taxon* >& entry ) {
270  return &taxon == entry.second;
271  }
272  );
273 
274  if( cand != border_candidates.end() ) {
275  // If we found the taxon in the border cand list, it means that it has
276  // children, otherwise it would never have been added there.
277  assert( taxon.size() > 0 );
278 
279  // Now, remove it.
280  border_candidates.erase( cand );
281  }
282  --border_taxa_count;
283  }
284 
286  for( auto& child : taxon ) {
287  resolve_big_subtaxa( child );
288  }
289 
290  } else if(
292  ) {
293 
294  // If the taxon is small enough, do not recurse.
295  // Instead, add the taxon as a border candidate.
296  // We do not want to change the state of their children then - thus, those stay
297  // outside for the moment. Later, the entropy pruning phase can then start from there.
298  add_taxon_to_border( taxon );
299 
300  } else {
301 
302  // The only case that is not treated so far is a taxon that is already border,
303  // but is smaller than the max subtax size, so we do not need to do anything, as
304  // this taxon is already properly in the list.
305  assert( status == EntropyTaxonData::PruneStatus::kBorder );
306  }
307  };
308 
309  // Run the resolve helper function for the taxonomy.
310  for( auto& taxon : taxonomy ) {
311  resolve_big_subtaxa( taxon );
312  }
313  }
314 
315  // LOG_DBG << "after max subtax size. num cands " << border_candidates.size();
316  // LOG_DBG1 << "valid " << ( validate_pruned_taxonomy( taxonomy ) ? "1" : "nooooooooooooooo" );
317  // for( auto const& elem : border_candidates ) {
318  // LOG_DBG1 << "H\t" << elem.first << "\tname\t" << TaxopathGenerator()( *elem.second ) << "\tlevel\t"
319  // << taxon_level( *elem.second ) << "\tchildren\t" << taxa_count_lowest_levels( *elem.second );
320  // }
321 
322  // -------------------------------------------------------------------------
323  // Default Init
324  // -------------------------------------------------------------------------
325 
326  // If we use neither of the min border level or max subtax size settings, we need
327  // to init the border front with just the first level of the taxonomy.
328  if( settings.min_border_level == 0 && settings.max_subtaxonomy_size == 0 ) {
329 
330  // Init with first level. See expand_taxon() for details.
331  for( auto& child : taxonomy ) {
332  add_taxon_to_border( child );
333  }
334  }
335 
336  // -------------------------------------------------------------------------
337  // Main Loop
338  // -------------------------------------------------------------------------
339 
340  // Loop until we have done enough pruning, i.e., if we exceeded the target size.
341  while( border_taxa_count < target_taxonomy_size ) {
342 
343  // We already checked that we will have enough leaf taxa to achieve the target size.
344  // So, there should always be candidates to choose for pruning.
345  assert( border_candidates.size() > 0 );
346 
347  // Get the taxon with the highest entropy from the candidates list. This will be the front
348  // for going deeper into the taxonomy, i.e., for going from border to inner.
349  // Assert that the iterators do their thing correctly and the last element is the last.
350  // (Those reverse iterations freak me out. They provoke off-by-one errors.)
351  auto cur_front = *border_candidates.rbegin();
352  assert( *std::prev(border_candidates.end()) == *border_candidates.rbegin() );
353 
354  // Remove the current front from the candidates, because we are about to use it for pruning.
355  // Assert that we removed it - the removal of reverse iterators is a bit weird, so better
356  // make sure.
357  border_candidates.erase( --border_candidates.rbegin().base() );
358  assert( cur_front != *border_candidates.rbegin() );
359 
360  // The current front taxon was considered a border taxon before.
361  assert(
362  cur_front.second->data<EntropyTaxonData>().status ==
364  );
365 
366  // Also, it has to have children, otherwise we would not have wanted it in the candidate
367  // list in the first place.
368  assert( cur_front.second->size() > 0 );
369 
370  // If we go into the front taxon, but achieve a new size that is further away from
371  // our target size, we don't do go deeper.
372  if( utils::abs_diff( border_taxa_count, target_taxonomy_size ) <
373  utils::abs_diff( border_taxa_count + cur_front.second->size(), target_taxonomy_size )
374  ) {
375  // If we allow approximation, we will continue with the loop, which means, we will use
376  // taxa with a lower entropy as pruning border. If we don't allow this, we are done here.
377  if( settings.allow_approximation ) {
378  continue;
379  } else {
380  break;
381  }
382  }
383 
384  // Prune at the front by making it an inside taxon, and its children the new border.
385  expand_taxon( *cur_front.second );
386  }
387 
388  // Output the entropy of the next split candidate. This is an indicator where the
389  // entropy threshold for splitting is.
390  // if( border_candidates.size() > 0 ) {
391  // auto cur_front = *border_candidates.rbegin();
392  // LOG_DBG << "entropy end: " << cur_front.first;
393  // }
394 
395  // Output the rest of the candidates, for debugging.
396  // LOG_DBG << "end of function. num candidates " << border_candidates.size();
397  // LOG_DBG1 << "prune border branches count " << count_taxa_with_prune_status(
398  // taxonomy, EntropyTaxonData::PruneStatus::kBorder
399  // );
400  // LOG_DBG1 << "valid " << ( validate_pruned_taxonomy( taxonomy ) ? "1" : "nooooooooooooooo" );
401  // for( auto const& elem : border_candidates ) {
402  // LOG_DBG1 << "H\t" << elem.first << "\tname\t" << TaxopathGenerator()( *elem.second ) << "\tlevel\t"
403  // << taxon_level( *elem.second ) << "\tchildren\t" << taxa_count_lowest_levels( *elem.second );
404  // }
405  // LOG_DBG << "out";
406 }
407 
408 // =================================================================================================
409 // Helper Functions
410 // =================================================================================================
411 
413  Taxonomy& taxonomy,
414  size_t min_subtaxonomy_size
415 ) {
416  for( auto& taxon : taxonomy ) {
417  auto status = taxon.data<EntropyTaxonData>().status;
418 
419  // Recurse
421  expand_small_subtaxonomies( taxon, min_subtaxonomy_size );
422  }
423 
424  // If the taxon has fewer leaves than the threshold (but is not a leaf itself),
425  // we make the whole sub-taxonomy a part of the pruned taxonomy.
426  if(
428  taxon.size() > 0 &&
429  taxa_count_lowest_levels( taxon ) < min_subtaxonomy_size
430  ) {
431 
432  // First, the taxon itself is inside.
434 
435  // Then, iterate it, and set all taxa in it to inner or
436  // border, depending on whether they are inner of leaf taxa.
437  for( auto it : preorder( taxon ) ) {
438  auto& sub_taxon_data = it.taxon().data<EntropyTaxonData>();
439  if( it.taxon().size() == 0 ) {
441  } else {
442  sub_taxon_data.status = EntropyTaxonData::PruneStatus::kInside;
443  }
444  }
445  }
446  }
447 }
448 
450  Taxonomy const& taxonomy,
452 ) {
453  size_t count = 0;
454  auto do_count = [&] ( Taxon const& taxon ) {
455  if( taxon.data<EntropyTaxonData>().status == status ) {
456  ++count;
457  }
458  };
459  preorder_for_each( taxonomy, do_count );
460  return count;
461 }
462 
464 {
465  auto do_removal = [&] ( Taxon& taxon ) {
467  taxon.clear_children();
468  }
469  };
470  preorder_for_each( taxonomy, do_removal );
471 }
472 std::string print_pruned_taxonomy( Taxonomy const& taxonomy )
473 {
474  std::string result;
475  auto print_taxon = [&] ( Taxon const& taxon ) {
476  result += std::string( taxon_level(taxon) * 4, ' ' );
478  result += utils::Style("Red")(taxon.name());
479  } else {
480  result += taxon.name();
481  }
482  if( taxon.data<EntropyTaxonData>().entropy > 0 ) {
483  result += " (" + std::to_string( taxon.data<EntropyTaxonData>().entropy ) + ")";
484  }
485  result += "\n";
486  };
487  preorder_for_each( taxonomy, print_taxon );
488  return result;
489 }
490 
491 bool validate_pruned_taxonomy( Taxonomy const& taxonomy )
492 {
493  // Currently, because of the iterators, we need to always traverse the whole taxonomy.
494  // Works for now, but should be speed up in the future with proper iterators.
495  bool correct = true;
496 
497  auto check_parents = [&] ( Taxon const& taxon ) {
498 
499  // Need to have correct data type. We check it here. In the while loop later, we don't
500  // have to: we are doing preorder traversal, so for each call of check_parents(), the
501  // parents were already checked.
502  if( taxon.data_cast<EntropyTaxonData>() == nullptr ) {
503  auto name = TaxopathGenerator()( taxon );
504  LOG_INFO << "Taxon with incorrect data type (not EntropyTaxonData): " << name;
505 
506  correct = false;
507  return;
508  }
509 
510  // Store the status of the current child. We'll move up the taxonomic hierarchy and check
511  // whether all parents of this child are conform with the prune status rules.
512  auto child_status = taxon.data<EntropyTaxonData>().status;
513 
514  // Check leaf state.
515  if( taxon.size() == 0 && child_status == EntropyTaxonData::PruneStatus::kInside ) {
516  auto name = TaxopathGenerator()( taxon );
517  LOG_INFO << "Taxon is a leaf but has status 'kInside': " << name;
518  correct = false;
519  }
520 
521  auto cur_ptr = taxon.parent();
522  while( cur_ptr != nullptr ) {
523  auto cur_status = cur_ptr->data<EntropyTaxonData>().status;
524 
525  if(
528  ) {
529  // Do nothing, all good.
530  } else if(
533  ) {
534  // Do nothing, all good.
535  } else if(
538  ) {
539  child_status = cur_status;
540  } else if(
543  ) {
544  child_status = cur_status;
545  } else {
546  auto name = TaxopathGenerator()( taxon );
547  LOG_INFO << "Taxon and child with wrong pruning status ("
548  << EntropyTaxonData::status_abbreviation( cur_status ) << "/"
549  << EntropyTaxonData::status_abbreviation( child_status ) << "): "
550  << name;
551 
552  correct = false;
553  return;
554  }
555 
556  cur_ptr = cur_ptr->parent();
557  }
558  };
559  preorder_for_each( taxonomy, check_parents );
560 
561  return correct;
562 }
563 
564 
565 /*
566 
567  Old functions for testing. Did not use the EntropyTaxonData, but lists with pointers into
568  the Taxonomy. Thus, ugly and and error prone. If needed, refactor first!
569 
570 / **
571  * @brief Split a Taxonomy at Taxa that exceed a certain entropy threshold.
572  *
573  * This is mainly a test method, as it is currently not further used.
574  * /
575 std::unordered_set< Taxon const* > split_taxonomy_by_entropy_threshold(
576  Taxonomy const& taxonomy,
577  std::unordered_map< Taxon const*, double > const& entropies,
578  double entropy_threshold
579 ) {
580  // Resulting list of Taxa where to split.
581  std::unordered_set< Taxon const* > crop_list;
582 
583  // Fill a stack of taxa with the first level of the Taxonomy.
584  // We will do a preorder traversal, but do not go deeper into branches that we do not want
585  // to split further. Thus, use this stack to keep track of which Taxa still need to be visited.
586  std::stack< Taxon const* > taxa_stack;
587  for( auto const& t : taxonomy ) {
588  taxa_stack.push( &t );
589  }
590 
591  // Iterate the taxonomy and either decide to not split but keep a certain Taxon
592  // (if its entropy is below threshold), or go deeper into the Taxon by adding it to the stack,
593  // so that it is also iterated and split at a deeper level.
594  while( ! taxa_stack.empty() ) {
595  auto const& cur = *taxa_stack.top();
596  taxa_stack.pop();
597 
598  // Make sure we only process each element once. Not all taxa end of in the split list,
599  // but none should be in there more than once.
600  assert( crop_list.count( &cur ) == 0 );
601 
602  // Make sure that the entropy has entries that belong to the taxonomy.
603  if( entropies.count( &cur ) == 0 ) {
604  auto name = TaxopathGenerator()( cur );
605  throw std::runtime_error( "Entropy list not complete. Missing Taxon " + name );
606  }
607 
608  // If the Taxon has a low entropy, it's sequences are similar to each other, so we can
609  // keep it as it is. Thus, no need to split it further, so add it to the list.
610  // Also, if it is a leaf of the taxonomy, we will not further traverse it, so add it.
611  if( entropies.at( &cur ) <= entropy_threshold || cur.size() == 0 ) {
612  crop_list.insert( &cur );
613 
614  // If the entropy is high, go deepter into it.
615  } else {
616  for( auto& t : cur ) {
617  taxa_stack.push( &t );
618  }
619  }
620  }
621 
622  return crop_list;
623 }
624 
625 / **
626  * @brief Test method for splitting a Taxonomy using nested intervals.
627  *
628  * This method is a test whether a Taxonomy can be splitted into low entropy regions using
629  * nested intervals. Did not work, as the entropy per Taxon is not monotonic in the hierarchy.
630  * /
631 std::unordered_set< Taxon const* > split_taxonomy_by_entropy_nested_invervals(
632  Taxonomy const& taxonomy,
633  std::unordered_map< Taxon const*, double > const& entropies,
634  size_t target_taxonomy_size
635 ) {
636  // Init the entropy limits used for the nested intervals loop.
637  // We will approach the best threshold from the min and max value, startig at the average.
638  double lower_limit = std::numeric_limits<double>::infinity();
639  double upper_limit = 0.0;
640  double average = 0.0;
641  for( auto const& elem : entropies ) {
642  if( elem.second < 0.0 ) {
643  throw std::runtime_error( "Invalid entropy value < 0.0." );
644  }
645 
646  lower_limit = std::min( lower_limit, elem.second );
647  upper_limit = std::max( upper_limit, elem.second );
648  average += elem.second;
649  }
650  average /= static_cast< double >( entropies.size() );
651 
652  // Check invariants for the limits.
653  assert( lower_limit <= average );
654  assert( average <= upper_limit );
655 
656  // Start the iterative process with the average threshold.
657  double threshold = average;
658 
659  // Target list: store the leaf taxa of the taxonomy.
660  std::unordered_set< Taxon const* > crop_list;
661 
662  while( true ) {
663  // Split the taxonomy using the current threshold.
664  auto cand_list = split_taxonomy_by_entropy_threshold( taxonomy, entropies, threshold );
665 
666  // If we are closer to our target size, update the list.
667  if( utils::abs_diff( cand_list.size(), target_taxonomy_size ) <
668  utils::abs_diff( crop_list.size(), target_taxonomy_size )
669  ) {
670  crop_list = cand_list;
671  }
672 
673  // Adjust the nested invervals, or finish.
674  // If the list is too big, use a higher threshold,
675  // if it is too small, use a lower one.
676  // If we hit the target size, we can stop.
677  if( cand_list.size() > target_taxonomy_size ) {
678  lower_limit = threshold;
679  threshold = ( threshold + upper_limit ) / 2.0;
680 
681  } else
682  if( cand_list.size() < target_taxonomy_size ) {
683  upper_limit = threshold;
684  threshold = ( threshold + lower_limit ) / 2.0;
685 
686  } else {
687  break;
688  }
689 
690  // Check invariants for the limits.
691  assert( lower_limit <= threshold );
692  assert( threshold <= upper_limit );
693 
694  // Last resort: exit condition based on nesting depth.
695  // Only in rare cases, we will exaclty hit the target size. Usually, we will jump back
696  // and forth between a value too low and one too high. Then, at some point, the interval
697  // converges at the entropy value that separates those two split candidates.
698  // If we converged enough, we can stop, there won't be a better split candidate.
699  if( upper_limit - lower_limit < 1.0E-10 * average ) {
700  break;
701  }
702 
703  // Test: Instead of nested intervals, simply try every n-th interval.
704  // threshold += (upper_limit - lower_limit) / 200.0;
705  // if( threshold > upper_limit ) {
706  // break;
707  // }
708  }
709 
710  return crop_list;
711 }
712 */
713 
714 } // namespace taxonomy
715 } // namespace genesis
size_t taxon_level(Taxon const &taxon)
Return the level of depth of a given Taxon.
Helper class to generate a taxonomic path string from a Taxopath object or a Taxon.
size_t max_subtaxonomy_size
Maximal size of a sub-taxonomy of the pruned Taxonomy. Default is 0.
utils::Range< IteratorPreorder< Taxonomy const, Taxon const > > preorder(TaxonomyType const &taxonomy)
void levelorder_for_each(Taxonomy &tax, std::function< void(Taxon &)> fn, bool include_inner_taxa=true)
Apply a function to all taxa of the Taxonomy, traversing it in levelorder.
bool validate_pruned_taxonomy(Taxonomy const &taxonomy)
Validate that the pruning status of a Taxonomy is valid.
#define LOG_WARN
Log a warning. See genesis::utils::LoggingLevel.
Definition: logging.hpp:95
size_t taxa_count_lowest_levels(Taxonomy const &tax)
Return the number of lowest level Taxa (i.e., taxa without sub-taxa) in the Taxonomy.
Simple text style class for colorized and bold output to a terminal.
Definition: style.hpp:81
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:381
Store a Taxonomy, i.e., a nested hierarchy of Taxa.
void preorder_for_each(Taxonomy &tax, std::function< void(Taxon &)> fn, bool include_inner_taxa=true)
Apply a function to all taxa of the Taxonomy, traversing it in preorder.
void remove_pruned_taxonomy_children(Taxonomy &taxonomy)
Remove the children of all Taxa that are pruned, i.e, that have prune status == kOutside.
static std::string status_abbreviation(PruneStatus stat)
size_t min_subtaxonomy_size
Minimal size of a sub-taxonomy of the pruned Taxonomy. Default is 0.
Store a Taxon, i.e., an element in a Taxonomy, with its name, rank, ID and sub-taxa.
Definition: taxon.hpp:76
size_t count_taxa_with_prune_status(Taxonomy const &taxonomy, EntropyTaxonData::PruneStatus status)
Return the number of Taxa that have a certain prune status.
bool allow_approximation
Allow some approximation in order to get closer to the target pruning size.
Store settings for the Taxonomy pruning algorithm prune_by_entropy().
void prune_by_entropy(Taxonomy &taxonomy, size_t target_taxonomy_size, PruneByEntropySettings settings)
Prune a Taxonomy so that the result (approximately) contains a desired number of "leaf" Taxa...
Provides easy and fast logging functionality.
size_t min_border_level
Minimum level of the Taxa that are considered inside for pruning. Default is 0.
void expand_small_subtaxonomies(Taxonomy &taxonomy, size_t min_subtaxonomy_size)
Expand the leaves of a pruned Taxonomy if their sub-taxonomies are smaller than the given threshold...
std::string print_pruned_taxonomy(Taxonomy const &taxonomy)
Print a Taxonomy, highlighting those Taxa that are the pruning border, i.e., where we cut off the sub...
TaxonDataType & data()
Definition: taxon.hpp:197
constexpr T abs_diff(T const &lhs, T const &rhs)
Calculate the absolute differenence between two values.
Definition: common.hpp:62
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:98