A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
functions/taxonomy.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
38 
39 #include <algorithm>
40 #include <assert.h>
41 #include <ostream>
42 #include <stdexcept>
43 #include <string>
44 #include <unordered_set>
45 #include <vector>
46 
47 namespace genesis {
48 namespace taxonomy {
49 
50 // =================================================================================================
51 // Find Functions
52 // =================================================================================================
53 
54 Taxon const* find_taxon_by_name( Taxonomy const& tax, std::string const& name )
55 {
56  return find_taxon_by_name( tax, name, DepthFirstSearch{} );
57 }
58 
59 Taxon* find_taxon_by_name( Taxonomy& tax, std::string const& name )
60 {
61  return find_taxon_by_name( tax, name, DepthFirstSearch{} );
62 }
63 
64 Taxon const* find_taxon_by_id( Taxonomy const& tax, std::string const& id )
65 {
66  return find_taxon_by_id( tax, id, DepthFirstSearch{} );
67 }
68 
69 Taxon* find_taxon_by_id( Taxonomy& tax, std::string const& id )
70 {
71  return find_taxon_by_id( tax, id, DepthFirstSearch{} );
72 }
73 
74 // =================================================================================================
75 // Accessors
76 // =================================================================================================
77 
78 size_t taxon_level( Taxon const& taxon )
79 {
80  size_t res = 0;
81  Taxon const* parent = taxon.parent();
82  while( parent != nullptr ) {
83  parent = parent->parent();
84  ++res;
85  }
86  return res;
87 }
88 
89 size_t total_taxa_count( Taxonomy const& tax )
90 {
91  size_t count = tax.size();
92  for( auto const& t : tax ) {
93  count += total_taxa_count( t );
94  }
95  return count;
96 }
97 
98 size_t taxa_count_lowest_levels( Taxonomy const& tax )
99 {
100  size_t count = 0;
101  for( auto const& t : tax ) {
102  if( t.size() == 0 ) {
103  ++count;
104  } else {
105  count += taxa_count_lowest_levels( t );
106  }
107  }
108  return count;
109 }
110 
111 size_t taxa_count_at_level( Taxonomy const& tax, size_t level )
112 {
113  // Recursive implementation, because we are lazy.
114  size_t count = 0;
115  if( level == 0 ) {
116  count += tax.size();
117  } else {
118  for( auto& c : tax ) {
119  count += taxa_count_at_level( c, level - 1 );
120  }
121  }
122  return count;
123 }
124 
125 std::vector< size_t > taxa_count_levels( Taxonomy const& tax )
126 {
127  if( tax.size() == 0 ) {
128  return std::vector< size_t >();
129  }
130 
131  std::vector< size_t > result( 1, 0 );
132  result[ 0 ] = tax.size();
133 
134  for( auto const& child : tax ) {
135  auto cres = taxa_count_levels( child );
136 
137  if( result.size() < cres.size() + 1 ) {
138  result.resize( cres.size() + 1, 0 );
139  }
140 
141  for( size_t i = 0; i < cres.size(); ++i ) {
142  result[ i+1 ] += cres[ i ];
143  }
144  }
145  return result;
146 }
147 
149  Taxonomy const& tax,
150  std::string const& rank,
151  bool case_sensitive
152 ) {
153  size_t count = 0;
154  for( auto const& t : tax ) {
155 
156  // Count.
157  if( case_sensitive ) {
158  if( t.rank() == rank ) {
159  ++count;
160  }
161  } else {
162  if( utils::equals_ci( t.rank(), rank )) {
163  ++count;
164  }
165  }
166 
167  // Recurse.
168  count += taxa_count_with_rank( t, rank, case_sensitive );
169  }
170  return count;
171 }
172 
173 std::unordered_map< std::string, size_t> taxa_count_ranks(
174  Taxonomy const& tax,
175  bool case_sensitive
176 ) {
177  std::unordered_map< std::string, size_t> result;
178 
179  for( auto const& taxon : tax ) {
180  if( case_sensitive ) {
181  ++result[ taxon.rank() ];
182  } else {
183  ++result[ utils::to_lower( taxon.rank() ) ];
184  }
185 
186  // Recurse.
187  auto children = taxa_count_ranks( taxon, case_sensitive );
188  for( auto const& ctaxon : children ) {
189  result[ ctaxon.first ] += ctaxon.second;
190  }
191  }
192 
193  return result;
194 }
195 
196 bool has_unique_ids( Taxonomy const& tax )
197 {
198  std::unordered_set<std::string> ids;
199  bool has_duplicates = false;
200 
201  auto collect_and_check = [&]( Taxon const& tax ){
202  if( ids.count( tax.id() ) > 0 ) {
203  has_duplicates = true;
204  return;
205  }
206  ids.insert( tax.id() );
207  };
208  preorder_for_each( tax, collect_and_check );
209 
210  return ! has_duplicates;
211 }
212 
213 // =================================================================================================
214 // Modifiers
215 // =================================================================================================
216 
217 void sort_by_name( Taxonomy& tax, bool recursive, bool case_sensitive )
218 {
219  // Make two functions for case sensitive and insensitive comparison.
220  auto comp_by_name_cs = []( Taxon const& lhs, Taxon const& rhs ) {
221  return lhs.name() < rhs.name();
222  };
223  auto comp_by_name_ci = []( Taxon const& lhs, Taxon const& rhs ) {
224  return utils::to_lower( lhs.name() ) < utils::to_lower( rhs.name() );
225  };
226 
227  // Sort.
228  if( case_sensitive ) {
229  // std::sort( tax.begin(), tax.end(), comp_by_name_cs );
230  tax.sort( comp_by_name_cs );
231  } else {
232  // std::sort( tax.begin(), tax.end(), comp_by_name_ci );
233  tax.sort( comp_by_name_ci );
234  }
235 
236  // Run recursion if necessary.
237  if( recursive ) {
238  for( auto& child : tax ) {
239  sort_by_name( child, true, case_sensitive );
240  }
241  }
242 }
243 
244 void remove_taxa_at_level( Taxonomy& tax, size_t level )
245 {
246  // Recursive implementation, because we are lazy.
247  if( level == 0 ) {
248  tax.clear_children();
249  } else {
250  for( auto& c : tax ) {
251  remove_taxa_at_level( c, level - 1 );
252  }
253  }
254 }
255 
256 // =================================================================================================
257 // Print and Output
258 // =================================================================================================
259 
260 std::ostream& operator << ( std::ostream& out, Taxonomy const& tax )
261 {
262  // We use a nested printer with max 10 lines per default.
263  auto printer = PrinterNested();
264  printer.line_limit( 10 );
265  printer.print( out, tax );
266  return out;
267 }
268 
269 bool validate( Taxonomy const& taxonomy, bool stop_at_first_error )
270 {
271  // Local recursive implementation of the function.
272  std::function< bool ( Taxonomy const&, bool, size_t ) > validate_rec = [&] (
273  Taxonomy const& tax, bool stop_at_first_error, size_t level
274  ) {
275  bool res = true;
276  for( auto const& c : tax ) {
277 
278  // Check current parent.
279  if( (level == 0 && c.parent() != nullptr) || (level > 0 && c.parent() != &tax) ) {
280  LOG_INFO << "Taxon child with invalid parent pointer: " << c.name();
281  if( stop_at_first_error ) {
282  return false;
283  } else {
284  res = false;
285  }
286  }
287 
288  // Recurse.
289  if( ! validate_rec( c, stop_at_first_error, level + 1 )) {
290  if( stop_at_first_error ) {
291  return false;
292  } else {
293  res = false;
294  }
295  }
296  }
297  return res;
298  };
299 
300  return validate_rec( taxonomy, stop_at_first_error, 0 );
301 }
302 
303 } // namespace taxonomy
304 } // namespace genesis
size_t taxon_level(Taxon const &taxon)
Return the level of depth of a given Taxon.
std::string const & name() const
Return the name of this taxon.
Definition: taxon.cpp:131
bool validate(Taxonomy const &taxonomy, bool stop_at_first_error)
Validate the internal data structures of a Taxonomy and its child Taxa Taxa.
std::unordered_map< std::string, size_t > taxa_count_ranks(Taxonomy const &tax, bool case_sensitive)
Count the number of Taxa in a Taxonomy per rank.
size_t taxa_count_with_rank(Taxonomy const &tax, std::string const &rank, bool case_sensitive)
Count the number of Taxa in a Taxonomy that have a certain rank assigned to them. ...
std::vector< size_t > taxa_count_levels(Taxonomy const &tax)
Count the number of Taxa at each level of depth in the Taxonomy.
size_t taxa_count_lowest_levels(Taxonomy const &tax)
Return the number of lowest level Taxa (i.e., taxa without sub-taxa) in the Taxonomy.
Taxon const * find_taxon_by_id(Taxonomy const &tax, std::string const &id)
Alias for find_taxon_by_id(..., DepthFirstSearch{}).
Taxon const * find_taxon_by_name(Taxonomy const &tax, std::string const &name)
Alias for find_taxon_by_name(..., DepthFirstSearch{}).
bool has_unique_ids(Taxonomy const &tax)
Return true iff all IDs of the Taxa in the Taxonomy are unique.
std::string to_lower(std::string const &str)
Return an all-lowercase copy of the given string, locale-aware.
Definition: string.hpp:206
Store a Taxonomy, i.e., a nested hierarchy of Taxa.
void preorder_for_each(Taxonomy &tax, std::function< void(Taxon &)> fn, bool include_inner_taxa=true)
Apply a function to all taxa of the Taxonomy, traversing it in preorder.
Simple printer class for Taxonomy.
Definition: nested.hpp:53
void sort(Compare comp)
Sort the taxonomy according to some compare criterion.
bool equals_ci(std::string const &lhs, std::string const &rhs)
Compare two strings case insensitive.
Definition: string.cpp:60
Store a Taxon, i.e., an element in a Taxonomy, with its name, rank, ID and sub-taxa.
Definition: taxon.hpp:76
std::ostream & operator<<(std::ostream &out, Taxonomy const &tax)
Print the contents of a Taxonomy, i.e., all nested taxa, up to a limit of 10.
void sort_by_name(Taxonomy &tax, bool recursive, bool case_sensitive)
Sort the Taxa of a Taxonomy by their name.
void remove_taxa_at_level(Taxonomy &tax, size_t level)
Remove all Taxa at a given level of depth in the Taxonomy hierarchy, and all their children...
Provides some commonly used string utility functions.
Provides easy and fast logging functionality.
size_t size() const
Return the number of immediate child Taxa.
Definition: taxonomy.cpp:84
size_t total_taxa_count(Taxonomy const &tax)
Return the total number of taxa contained in the Taxomony, i.e., the number of (non-unique) names of ...
Taxon const * parent() const
Return a pointer to the parent of this taxon, or a nullptr if this is the top level taxon...
Definition: taxon.cpp:173
void clear_children()
Remove all children.
Definition: taxonomy.cpp:224
size_t taxa_count_at_level(Taxonomy const &tax, size_t level)
Count the number of Taxa at a certain level of depth in the Taxonomy.
#define LOG_INFO
Log an info message. See genesis::utils::LoggingLevel.
Definition: logging.hpp:98