A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
functions/taxonomy.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
39 
40 #include <algorithm>
41 #include <assert.h>
42 #include <ostream>
43 #include <stdexcept>
44 #include <string>
45 #include <vector>
46 
47 namespace genesis {
48 namespace taxonomy {
49 
50 // =================================================================================================
51 // Accessors
52 // =================================================================================================
53 
57 Taxon const* find_taxon_by_name( Taxonomy const& tax, std::string const& name )
58 {
59  for( auto const& c : tax ) {
60  if( c.name() == name ) {
61  return &c;
62  }
63  auto rec = find_taxon_by_name( c, name );
64  if( rec != nullptr ) {
65  return rec;
66  }
67  }
68  return nullptr;
69 }
70 
74 Taxon* find_taxon_by_name( Taxonomy& tax, std::string const& name )
75 {
76  // Avoid code duplication according to Scott Meyers.
77  auto const& ctax = static_cast< Taxonomy const& >( tax );
78  return const_cast< Taxon* >( find_taxon_by_name( ctax, name ));
79 }
80 
88 size_t taxon_level( Taxon const& taxon )
89 {
90  size_t res = 0;
91  Taxon const* parent = taxon.parent();
92  while( parent != nullptr ) {
93  parent = parent->parent();
94  ++res;
95  }
96  return res;
97 }
98 
114 size_t total_taxa_count( Taxonomy const& tax )
115 {
116  size_t count = tax.size();
117  for( auto const& t : tax ) {
118  count += total_taxa_count( t );
119  }
120  return count;
121 }
122 
142 size_t taxa_count_lowest_levels( Taxonomy const& tax )
143 {
144  size_t count = 0;
145  for( auto const& t : tax ) {
146  if( t.size() == 0 ) {
147  ++count;
148  } else {
149  count += taxa_count_lowest_levels( t );
150  }
151  }
152  return count;
153 }
154 
166 size_t taxa_count_at_level( Taxonomy const& tax, size_t level )
167 {
168  // Recursive implementation, because we are lazy.
169  size_t count = 0;
170  if( level == 0 ) {
171  count += tax.size();
172  } else {
173  for( auto& c : tax ) {
174  count += taxa_count_at_level( c, level - 1 );
175  }
176  }
177  return count;
178 }
179 
191 std::vector< size_t > taxa_count_levels( Taxonomy const& tax )
192 {
193  if( tax.size() == 0 ) {
194  return std::vector< size_t >();
195  }
196 
197  std::vector< size_t > result( 1, 0 );
198  result[ 0 ] = tax.size();
199 
200  for( auto const& child : tax ) {
201  auto cres = taxa_count_levels( child );
202 
203  if( result.size() < cres.size() + 1 ) {
204  result.resize( cres.size() + 1, 0 );
205  }
206 
207  for( size_t i = 0; i < cres.size(); ++i ) {
208  result[ i+1 ] += cres[ i ];
209  }
210  }
211  return result;
212 }
213 
225  Taxonomy const& tax,
226  std::string const& rank,
227  bool case_sensitive
228 ) {
229  size_t count = 0;
230  for( auto const& t : tax ) {
231 
232  // Count.
233  if( case_sensitive ) {
234  if( t.rank() == rank ) {
235  ++count;
236  }
237  } else {
238  if( utils::equals_ci( t.rank(), rank )) {
239  ++count;
240  }
241  }
242 
243  // Recurse.
244  count += taxa_count_with_rank( t, rank, case_sensitive );
245  }
246  return count;
247 }
248 
264 std::unordered_map< std::string, size_t> taxa_count_ranks(
265  Taxonomy const& tax,
266  bool case_sensitive
267 ) {
268  std::unordered_map< std::string, size_t> result;
269 
270  for( auto const& taxon : tax ) {
271  if( case_sensitive ) {
272  ++result[ taxon.rank() ];
273  } else {
274  ++result[ utils::to_lower( taxon.rank() ) ];
275  }
276 
277  // Recurse.
278  auto children = taxa_count_ranks( taxon, case_sensitive );
279  for( auto const& ctaxon : children ) {
280  result[ ctaxon.first ] += ctaxon.second;
281  }
282  }
283 
284  return result;
285 }
286 
287 // =================================================================================================
288 // Modifiers
289 // =================================================================================================
290 
303 void sort_by_name( Taxonomy& tax, bool recursive, bool case_sensitive )
304 {
305  // Make two functions for case sensitive and insensitive comparison.
306  auto comp_by_name_cs = []( Taxon const& lhs, Taxon const& rhs ) {
307  return lhs.name() < rhs.name();
308  };
309  auto comp_by_name_ci = []( Taxon const& lhs, Taxon const& rhs ) {
310  return utils::to_lower( lhs.name() ) < utils::to_lower( rhs.name() );
311  };
312 
313  // Sort.
314  if( case_sensitive ) {
315  std::sort( tax.begin(), tax.end(), comp_by_name_cs );
316  } else {
317  std::sort( tax.begin(), tax.end(), comp_by_name_ci );
318  }
319 
320  // Run recursion if necessary.
321  if( recursive ) {
322  for( auto& child : tax ) {
323  sort_by_name( child, true, case_sensitive );
324  }
325  }
326 }
327 
338 void remove_taxa_at_level( Taxonomy& tax, size_t level )
339 {
340  // Recursive implementation, because we are lazy.
341  if( level == 0 ) {
342  tax.clear_children();
343  } else {
344  for( auto& c : tax ) {
345  remove_taxa_at_level( c, level - 1 );
346  }
347  }
348 }
349 
350 // =================================================================================================
351 // Print and Output
352 // =================================================================================================
353 
360 std::ostream& operator << ( std::ostream& out, Taxonomy const& tax )
361 {
362  // We use a nested printer with max 10 lines per default.
363  auto printer = PrinterNested();
364  printer.line_limit( 10 );
365  printer.print( out, tax );
366  return out;
367 }
368 
383 bool validate( Taxonomy const& taxonomy, bool stop_at_first_error )
384 {
385  // Local recursive implementation of the function.
386  std::function< bool ( Taxonomy const&, bool, size_t ) > validate_rec = [&] (
387  Taxonomy const& tax, bool stop_at_first_error, size_t level
388  ) {
389  bool res = true;
390  for( auto const& c : tax ) {
391 
392  // Check current parent.
393  if( (level == 0 && c.parent() != nullptr) || (level > 0 && c.parent() != &tax) ) {
394  LOG_INFO << "Taxon child with invalid parent pointer: " << c.name();
395  if( stop_at_first_error ) {
396  return false;
397  } else {
398  res = false;
399  }
400  }
401 
402  // Recurse.
403  if( ! validate_rec( c, stop_at_first_error, level + 1 )) {
404  if( stop_at_first_error ) {
405  return false;
406  } else {
407  res = false;
408  }
409  }
410  }
411  return res;
412  };
413 
414  return validate_rec( taxonomy, stop_at_first_error, 0 );
415 }
416 
417 } // namespace taxonomy
418 } // namespace genesis
size_t taxon_level(Taxon const &taxon)
Return the level of depth of a given Taxon.
bool validate(Taxonomy const &taxonomy, bool stop_at_first_error)
Validate the internal data structures of a Taxonomy and its child Taxa Taxa.
iterator begin()
Return an iterator to the beginning of the child taxa.
Definition: taxonomy.cpp:290
std::string const & name() const
Return the name of this taxon.
Definition: taxon.cpp:169
std::vector< size_t > taxa_count_levels(Taxonomy const &tax)
Count the number of Taxa at each level of depth in the Taxonomy.
size_t taxa_count_lowest_levels(Taxonomy const &tax)
Return the number of lowest level Taxa (i.e., taxa without sub-taxa) in the Taxonomy.
Taxon const * find_taxon_by_name(Taxonomy const &tax, std::string const &name)
Find a Taxon with a given name by recursively searching the Taxonomy.
std::string to_lower(std::string const &str)
Return an all-lowercase copy of the given string, locale-aware.
Definition: string.hpp:198
Store a Taxonomy, i.e., a nested hierarchy of Taxa.
Simple printer class for Taxonomy.
Definition: nested.hpp:53
bool equals_ci(std::string const &lhs, std::string const &rhs)
Compare two strings case insensitive.
Definition: string.cpp:59
Store a Taxon, i.e., an element in a Taxonomy, with its name, rank and sub-taxa.
Definition: taxon.hpp:76
std::ostream & operator<<(std::ostream &out, Taxonomy const &tax)
Print the contents of a Taxonomy, i.e., all nested taxa, up to a limit of 10.
void remove_taxa_at_level(Taxonomy &tax, size_t level)
Remove all Taxa at a given level of depth in the Taxonomy hierarchy, and all their children...
Provides some commonly used string utility functions.
Provides easy and fast logging functionality.
std::unordered_map< std::string, size_t > taxa_count_ranks(Taxonomy const &tax, bool case_sensitive)
Count the number of Taxa in a Taxonomy per rank.
size_t size() const
Return the number of immediate child Taxa.
Definition: taxonomy.cpp:113
iterator end()
Return an iterator to the end of the child taxa.
Definition: taxonomy.cpp:298
void sort_by_name(Taxonomy &tax, bool recursive, bool case_sensitive)
Sort the Taxa of a Taxonomy by their name.
size_t total_taxa_count(Taxonomy const &tax)
Return the total number of taxa contained in the Taxomony, i.e., the number of (non-unique) names of ...
size_t taxa_count_with_rank(Taxonomy const &tax, std::string const &rank, bool case_sensitive)
Count the number of Taxa in a Taxonomy that have a certain rank assigned to them. ...
Taxon const * parent() const
Return a pointer to the parent of this taxon, or a nullptr if this is the top level taxon...
Definition: taxon.cpp:209
void clear_children()
Remove all children.
Definition: taxonomy.cpp:278
size_t taxa_count_at_level(Taxonomy const &tax, size_t level)
Count the number of Taxa at a certain level of depth in the Taxonomy.
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:98