A library for working with phylogenetic and population genetic data.
v0.27.0
labels.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
40 
41 #include <algorithm>
42 #include <cassert>
43 #include <cctype>
44 // #include <regex>
45 #include <stdexcept>
46 
47 namespace genesis {
48 namespace sequence {
49 
50 // =================================================================================================
51 // General
52 // =================================================================================================
53 
54 Sequence const* find_sequence( SequenceSet const& set, std::string const& label )
55 {
56  for (Sequence const& s : set) {
57  if (s.label() == label) {
58  return &s;
59  }
60  }
61  return nullptr;
62 }
63 
64 std::unordered_set<std::string> labels( SequenceSet const& set )
65 {
66  std::unordered_set<std::string> result;
67  for( auto const& seq : set ) {
68  result.insert( seq.label() );
69  }
70  return result;
71 }
72 std::pair<std::string, size_t> guess_sequence_abundance( Sequence const& sequence )
73 {
74  return guess_sequence_abundance( sequence.label() );
75 }
76 
77 std::pair<std::string, size_t> guess_sequence_abundance( std::string const& label )
78 {
79  std::string res_name = label;
80  size_t res_abun = 1;
81 
82  // We only look for a simple number, no sign oder decimal points etc
83  auto is_digits_ = []( std::string const& s )
84  {
85  return s.find_first_not_of( "0123456789" ) == std::string::npos;
86  };
87 
88  // Try to find ";size=123;", using label attributes.
89  try{
90  auto const la = label_attributes( label );
91  res_name = la.label;
92  if( la.attributes.count( "size" ) > 0 && is_digits_( la.attributes.at( "size" ))) {
93  res_abun = std::stoull( la.attributes.at( "size" ));
94  }
95  } catch( ... ) {
96  res_name = label;
97  res_abun = 1;
98  }
99 
100  // Try to find "_123" at the end.
101  auto const upos = label.find_last_of( "_" );
102  if( upos != std::string::npos && upos + 1 < label.size() && ::isdigit( label[ upos + 1 ]) ) {
103 
104  // The rest of the label needs to be a number.
105  auto const sub = label.substr( upos + 1 );
106  if( is_digits_( sub ) ) {
107  res_name = label.substr( 0, upos );
108  res_abun = std::stoull( sub );
109  }
110  }
111 
112  return { res_name, res_abun };
113 
114  // Try to find "size=123".
115  // This is the old version that directly parses the label.
116  // auto spos = label.find( "size=" );
117  // if( spos != std::string::npos && spos + 5 < label.size() && ::isdigit( label[ spos + 5 ]) ) {
118  //
119  // // Parse the substring as far as possible, that is, get all digits.
120  // auto const sub = label.substr( spos + 5 );
121  // try{
122  // res_abun = std::stoull( sub );
123  //
124  // // If the number parsing above succeeds, also change the name/label.
125  // // Here, we need to take care of a semicolon (or other non-alpha char)
126  // // that might appear in front of the "size=" part. If there is one, ignore it.
127  // if( spos > 0 && ::ispunct( label[ spos - 1 ] )) {
128  // --spos;
129  // }
130  // res_name = label.substr( 0, spos );
131  // } catch( ... ){
132  // res_name = label;
133  // res_abun = 1;
134  // }
135  // }
136 
137  // Slow regex version
138  // Prepare static regex (no need to re-compile it on every function call).
139  // Matches either ";size=123;" or "_123"
140  // static const std::string expr = "(?:[;]?size=([0-9]+)[;]?)|(?:_([0-9]+)$)";
141  // static std::regex pattern( expr );
142  //
143  // // Run the expression.
144  // std::smatch matches;
145  // if( std::regex_search( label, matches, pattern )) {
146  // size_t res;
147  // std::string const num = ( matches[1].str().empty() ? matches[2].str() : matches[1].str() );
148  // sscanf( num.c_str(), "%zu", &res );
149  // return res;
150  // } else {
151  // return 1;
152  // }
153 }
154 
156 {
157  return label_attributes( sequence.label() );
158 }
159 
160 LabelAttributes label_attributes( std::string const& label )
161 {
162  // Set the label to the first part (before the first semicolon).
163  // This is always correct, even if there are no semicola.
164  LabelAttributes result;
165  auto const attribs = utils::split( label, ";" );
166  assert( attribs.size() > 0 );
167  result.label = attribs.front();
168 
169  // Set the other parts. We here require that the attribs follow the needed structure.
170  for( size_t i = 1; i < attribs.size(); ++i ) {
171  auto const ap = utils::split( attribs[i], "=" );
172  if( ap.size() != 2 ) {
173  throw std::runtime_error( "Invalid Sequence label for extracting label attributes." );
174  }
175  result.attributes[ ap[0] ] = ap[1];
176  }
177 
178  return result;
179 }
180 
181 // =================================================================================================
182 // Uniqueness
183 // =================================================================================================
184 
185 bool has_unique_labels( SequenceSet const& set, bool case_sensitive )
186 {
187  std::unordered_set< std::string > label_set;
188  std::string label;
189 
190  for( auto const& seq : set ) {
191  if( case_sensitive ) {
192  label = seq.label();
193  } else {
194  label = utils::to_lower( seq.label() );
195  }
196 
197  if( label_set.count( label ) > 0 ) {
198  return false;
199  } else {
200  label_set.insert( label );
201  }
202  }
203  return true;
204 }
205 
207 {
208  auto const digest = utils::hash_hex( utils::from_string( seq.sites() ), hash_function );
209  seq.label( digest );
210 }
211 
213 {
214  for( auto& seq : set ) {
215  relabel_with_hash( seq, hash_function );
216  }
217 }
218 
219 // =================================================================================================
220 // Validity
221 // =================================================================================================
222 
/**
 * @brief Check whether a given string is a valid label for a Sequence.
 *
 * A label is valid if every character is a graphical character according to
 * `std::isgraph` (so no spaces, tabs or control characters) and is none of the
 * characters `:,();[]'`. An empty label is valid.
 *
 * @param label Label to check.
 * @return `true` if all characters are allowed, `false` otherwise.
 */
bool is_valid_label( std::string const& label )
{
    std::string const invalid_chars = ":,();[]'";
    for( char const c : label ) {
        // The <cctype> functions require a value representable as unsigned char;
        // passing a negative plain char (e.g., a non-ASCII byte) is undefined behavior.
        bool const graphical = std::isgraph( static_cast<unsigned char>( c )) != 0;
        if( ! graphical || invalid_chars.find( c ) != std::string::npos ) {
            return false;
        }
    }
    return true;
}
233 
234 bool has_valid_label( Sequence const& seq )
235 {
236  return is_valid_label( seq.label() );
237 }
238 
239 bool has_valid_labels( SequenceSet const& set )
240 {
241  for( auto const& seq : set ) {
242  if( ! has_valid_label( seq )) {
243  return false;
244  }
245  }
246  return true;
247 }
248 
/**
 * @brief Sanitize a label by replacing all invalid characters with underscores.
 *
 * A character is invalid if it is not a graphical character according to `std::isgraph`
 * (e.g., spaces, tabs, control characters), or if it is one of `:,();[]'`.
 * Each invalid character is replaced by exactly one `_`, so the result has the same
 * length as the input.
 *
 * @param label Label to sanitize.
 * @return Copy of @p label with every invalid character replaced by an underscore.
 */
std::string sanitize_label( std::string const& label )
{
    std::string result;
    result.reserve( label.size() );

    std::string const invalid_chars = ":,();[]'";
    for( char const c : label ) {
        // The <cctype> functions require a value representable as unsigned char;
        // passing a negative plain char (e.g., a non-ASCII byte) is undefined behavior.
        bool const graphical = std::isgraph( static_cast<unsigned char>( c )) != 0;
        if( ! graphical || invalid_chars.find( c ) != std::string::npos ) {
            result += '_';
        } else {
            result += c;
        }
    }
    return result;
}
264 
266 {
267  seq.label( sanitize_label( seq.label() ));
268 }
269 
271 {
272  for( auto& seq : set ) {
273  sanitize_label( seq );
274  }
275 }
276 
277 // =================================================================================================
278 // Modifiers
279 // =================================================================================================
280 
282  SequenceSet& set,
283  std::unordered_set<std::string> const& labels,
284  bool invert
285 ) {
286  auto new_last = std::remove_if(
287  set.begin(),
288  set.end(),
289  [&] ( Sequence const& seq ) {
290  return ( !invert && labels.count( seq.label() ) > 0 ) ||
291  ( invert && labels.count( seq.label() ) == 0 );
292  }
293  );
294  set.remove( new_last , set.end() );
295 }
296 
297 } // namespace sequence
298 } // namespace genesis
genesis::sequence::is_valid_label
bool is_valid_label(std::string const &label)
Check whether a given string is a valid label for a Sequence.
Definition: labels.cpp:223
genesis::sequence::find_sequence
Sequence const * find_sequence(SequenceSet const &set, std::string const &label)
Return a pointer to a Sequence with a specific label, or nullptr iff not found.
Definition: labels.cpp:54
genesis::sequence::filter_by_label_list
void filter_by_label_list(SequenceSet &set, std::unordered_set< std::string > const &labels, bool invert)
Remove all those Sequences from a SequenceSet whose labels are in the given list.
Definition: labels.cpp:281
genesis::utils::HashingFunctions
HashingFunctions
List of the currently implemented hashing functions.
Definition: utils/tools/hash/functions.hpp:53
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
genesis::sequence::LabelAttributes
Definition: labels.hpp:55
genesis::utils::hash_hex
std::string hash_hex(std::shared_ptr< BaseInputSource > source, HashingFunctions hash_fct)
Calculate the hash of an input source, using a given hashing function, and return its hex representation.
Definition: utils/tools/hash/functions.cpp:47
labels.hpp
genesis::sequence::SequenceSet::begin
iterator begin()
Definition: sequence_set.cpp:139
genesis::sequence::has_valid_labels
bool has_valid_labels(SequenceSet const &set)
Check whether all Sequences in a SequenceSet have valid labels.
Definition: labels.cpp:239
sequence_set.hpp
genesis::utils::from_string
std::shared_ptr< BaseInputSource > from_string(std::string const &input_string)
Obtain an input source for reading from a string.
Definition: input_source.hpp:133
string.hpp
Provides some commonly used string utility functions.
sha1.hpp
genesis::sequence::sanitize_label
std::string sanitize_label(std::string const &label)
Sanitize a label by replacing all invalid characters with underscores.
Definition: labels.cpp:249
genesis::sequence::Sequence::label
std::string & label()
Definition: sequence/sequence.hpp:90
genesis::sequence::labels
std::unordered_set< std::string > labels(SequenceSet const &set)
Return a set of all labels of the SequenceSet.
Definition: labels.cpp:64
genesis::sequence::SequenceSet::remove
void remove(size_t index)
Remove the Sequence at a given index from the SequenceSet.
Definition: sequence_set.cpp:98
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::SequenceSet
Store a set of Sequences.
Definition: sequence_set.hpp:59
genesis::sequence::relabel_with_hash
void relabel_with_hash(Sequence &seq, utils::HashingFunctions hash_function)
Relabel the Sequence using the hash digest of its sites.
Definition: labels.cpp:206
md5.hpp
genesis::sequence::Sequence::sites
std::string & sites()
Definition: sequence/sequence.hpp:110
genesis::sequence::LabelAttributes::label
std::string label
Definition: labels.hpp:57
genesis::sequence::has_unique_labels
bool has_unique_labels(SequenceSet const &set, bool case_sensitive)
Return true iff all labels of the Sequences in the SequenceSet are unique.
Definition: labels.cpp:185
genesis::sequence::sanitize_labels
void sanitize_labels(SequenceSet &set)
Sanitize the labels of all Sequences in the SequenceSet by replacing all invalid characters with underscores.
Definition: labels.cpp:270
genesis::utils::to_lower
constexpr char to_lower(char c) noexcept
Return the lower case version of a letter, ASCII-only.
Definition: char.hpp:221
genesis::sequence::guess_sequence_abundance
std::pair< std::string, size_t > guess_sequence_abundance(Sequence const &sequence)
Guess the abundance of a Sequence, using its label.
Definition: labels.cpp:72
genesis::sequence::LabelAttributes::attributes
std::unordered_map< std::string, std::string > attributes
Definition: labels.hpp:58
genesis::sequence::SequenceSet::end
iterator end()
Definition: sequence_set.cpp:144
genesis::sequence::label_attributes
LabelAttributes label_attributes(Sequence const &sequence)
Get the attributes list (semicolons-separated) from a Sequence.
Definition: labels.cpp:155
sequence.hpp
genesis::sequence::has_valid_label
bool has_valid_label(Sequence const &seq)
Check whether a Sequence has a valid label.
Definition: labels.cpp:234
genesis::utils::split
std::vector< std::string > split(std::string const &str, std::string const &delimiters, const bool trim_empty)
Split a string into parts, given a delimiters set of chars.
Definition: string.cpp:386
sha256.hpp