A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
labels.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
40 
41 #include <algorithm>
42 #include <cctype>
43 // #include <regex>
44 
45 namespace genesis {
46 namespace sequence {
47 
48 // =================================================================================================
49 // General
50 // =================================================================================================
51 
52 Sequence const* find_sequence( SequenceSet const& set, std::string const& label )
53 {
54  for (Sequence const& s : set) {
55  if (s.label() == label) {
56  return &s;
57  }
58  }
59  return nullptr;
60 }
61 
62 std::unordered_set<std::string> labels( SequenceSet const& set )
63 {
64  std::unordered_set<std::string> result;
65  for( auto const& seq : set ) {
66  result.insert( seq.label() );
67  }
68  return result;
69 }
70 std::pair<std::string, size_t> guess_sequence_abundance( Sequence const& sequence )
71 {
72  return guess_sequence_abundance( sequence.label() );
73 }
74 
/**
 * @brief Guess the abundance of a sequence from its label.
 *
 * Two label conventions are recognized:
 *
 *  - `size=123` anywhere in the label. The returned name is the part of the label in front
 *    of `size=`, with one directly preceding punctuation character (e.g., `;`) dropped.
 *    Everything after the number is ignored.
 *  - `_123` at the very end of the label. The returned name is the part in front of the
 *    last underscore.
 *
 * The `_123` form is evaluated last, so if both forms match, its result is the one returned.
 * If neither form matches, or if the number does not fit into the integer type used for
 * parsing, the corresponding form is treated as not present. Without any match, the result
 * is the unchanged label and an abundance of 1.
 *
 * @param label Label to inspect.
 * @return Pair of the label without the abundance annotation, and the abundance.
 */
std::pair<std::string, size_t> guess_sequence_abundance( std::string const& label )
{
    std::string res_name = label;
    size_t      res_abun = 1;

    // The <cctype> classifiers have undefined behavior for negative values, which plain char
    // yields for non-ASCII bytes. Hence, always go through unsigned char.
    auto const is_digit_at = [&label]( std::string::size_type pos )
    {
        return pos < label.size()
            && std::isdigit( static_cast<unsigned char>( label[ pos ] )) != 0;
    };

    // We only look for a simple number, no sign or decimal points etc.
    auto const is_digits = []( std::string const& s )
    {
        return s.find_first_not_of( "0123456789" ) == std::string::npos;
    };

    // Try to find "size=123"
    auto spos = label.find( "size=" );
    if( is_digit_at( spos == std::string::npos ? spos : spos + 5 )) {
        try{
            // Parse the substring as far as possible, that is, get all leading digits.
            size_t const abundance = std::stoull( label.substr( spos + 5 ));

            // The number parsing succeeded, so also change the name/label.
            // Here, we need to take care of a semicolon (or other punctuation char)
            // that might appear in front of the "size=" part. If there is one, ignore it.
            if( spos > 0 && std::ispunct( static_cast<unsigned char>( label[ spos - 1 ] ))) {
                --spos;
            }
            res_name = label.substr( 0, spos );
            res_abun = abundance;
        } catch( ... ){
            // Number too large to be represented: behave as if "size=" was not there.
            res_name = label;
            res_abun = 1;
        }
    }

    // Try to find "_123" at the end
    auto const upos = label.find_last_of( '_' );
    if( upos != std::string::npos && is_digit_at( upos + 1 )) {

        // The rest of the label needs to be a number.
        auto const sub = label.substr( upos + 1 );
        if( is_digits( sub )) {
            try{
                // Parse first, so that the result is only touched if the number is usable.
                size_t const abundance = std::stoull( sub );
                res_name = label.substr( 0, upos );
                res_abun = abundance;
            } catch( ... ){
                // Number too large to be represented: keep what we have found so far,
                // consistent with the handling of the "size=" form above.
            }
        }
    }

    // A regex such as "(?:[;]?size=([0-9]+)[;]?)|(?:_([0-9]+)$)" would express the same
    // matching, but was found to be slow, which is why the manual parsing above is used.
    return { res_name, res_abun };
}
139 
140 // =================================================================================================
141 // Uniqueness
142 // =================================================================================================
143 
144 bool has_unique_labels( SequenceSet const& set, bool case_sensitive )
145 {
146  std::unordered_set< std::string > labels;
147  std::string label;
148 
149  for( auto const& seq : set ) {
150  if( case_sensitive ) {
151  label = seq.label();
152  } else {
153  label = utils::to_lower( seq.label() );
154  }
155 
156  if( labels.count( label ) > 0 ) {
157  return false;
158  } else {
159  labels.insert( label );
160  }
161  }
162  return true;
163 }
164 
166 {
167  auto const digest = utils::hash_from_string_hex( seq.sites(), hash_function );
168  seq.label( digest );
169 }
170 
172 {
173  for( auto& seq : set ) {
174  relabel_with_hash( seq, hash_function );
175  }
176 }
177 
178 // =================================================================================================
179 // Validity
180 // =================================================================================================
181 
/**
 * @brief Check whether a given string is a valid label for a Sequence.
 *
 * A label is valid if every character is a graphical character according to `isgraph`
 * (which excludes spaces and control characters) and is none of `:,();[]'`.
 * The empty string contains no offending character and hence is valid.
 *
 * @param label String to check.
 * @return `true` iff no character of @p label is invalid.
 */
bool is_valid_label( std::string const& label )
{
    std::string const invalid_chars = ":,();[]'";
    return std::all_of(
        label.begin(),
        label.end(),
        [&invalid_chars]( char c ) {
            // isgraph() has undefined behavior for negative values, which plain char yields
            // for non-ASCII bytes. Hence, classify the unsigned char value.
            return std::isgraph( static_cast<unsigned char>( c )) != 0
                && invalid_chars.find( c ) == std::string::npos;
        }
    );
}
192 
193 bool has_valid_label( Sequence const& seq )
194 {
195  return is_valid_label( seq.label() );
196 }
197 
198 bool has_valid_labels( SequenceSet const& set )
199 {
200  for( auto const& seq : set ) {
201  if( ! has_valid_label( seq )) {
202  return false;
203  }
204  }
205  return true;
206 }
207 
/**
 * @brief Sanitize a label by replacing all invalid characters with underscores.
 *
 * A character is invalid if it is not a graphical character according to `isgraph`
 * (e.g., spaces and control characters), or if it is one of `:,();[]'`.
 * Each invalid character is replaced by exactly one `_`, so the result has the same
 * length as the input.
 *
 * @param label Label to sanitize.
 * @return Copy of @p label with every invalid character replaced by `_`.
 */
std::string sanitize_label( std::string const& label )
{
    std::string const invalid_chars = ":,();[]'";

    std::string result = label;
    for( char& c : result ) {
        // isgraph() has undefined behavior for negative values, which plain char yields
        // for non-ASCII bytes. Hence, classify the unsigned char value.
        bool const graphical = std::isgraph( static_cast<unsigned char>( c )) != 0;
        if( ! graphical || invalid_chars.find( c ) != std::string::npos ) {
            c = '_';
        }
    }
    return result;
}
223 
225 {
226  seq.label( sanitize_label( seq.label() ));
227 }
228 
230 {
231  for( auto& seq : set ) {
232  sanitize_label( seq );
233  }
234 }
235 
236 // =================================================================================================
237 // Modifiers
238 // =================================================================================================
239 
241  SequenceSet& set,
242  std::unordered_set<std::string> const& labels,
243  bool invert
244 ) {
245  auto new_last = std::remove_if(
246  set.begin(),
247  set.end(),
248  [&] ( Sequence const& seq ) {
249  return ( !invert && labels.count( seq.label() ) > 0 ) ||
250  ( invert && labels.count( seq.label() ) == 0 );
251  }
252  );
253  set.remove( new_last , set.end() );
254 }
255 
256 } // namespace sequence
257 } // namespace genesis
std::pair< std::string, size_t > guess_sequence_abundance(Sequence const &sequence)
Guess the abundance of a Sequence, using its label.
Definition: labels.cpp:70
std::string const & sites() const
Definition: sequence.cpp:58
void filter_by_label_list(SequenceSet &set, std::unordered_set< std::string > const &labels, bool invert)
Remove all those Sequences from a SequenceSet whose labels are in the given list. ...
Definition: labels.cpp:240
bool has_unique_labels(SequenceSet const &set, bool case_sensitive)
Return true iff all labels of the Sequences in the SequenceSet are unique.
Definition: labels.cpp:144
std::string to_lower(std::string const &str)
Return an all-lowercase copy of the given string, locale-aware.
Definition: string.hpp:206
std::string const & label() const
Definition: sequence.cpp:44
bool has_valid_labels(SequenceSet const &set)
Check whether all Sequences in a SequenceSet have valid labels.
Definition: labels.cpp:198
void sanitize_labels(SequenceSet &set)
Sanitize the labels of all Sequences in the SequenceSet by replacing all invalid characters with unde...
Definition: labels.cpp:229
Provides some commonly used string utility functions.
void remove(size_t index)
Remove the Sequence at a given index from the SequenceSet.
bool is_valid_label(std::string const &label)
Check whether a given string is a valid label for a Sequence.
Definition: labels.cpp:182
Store a set of Sequences.
std::string sanitize_label(std::string const &label)
Sanitize a label by replacing all invalid characters with underscores.
Definition: labels.cpp:208
std::unordered_set< std::string > labels(SequenceSet const &set)
Return a set of all labels of the SequenceSet.
Definition: labels.cpp:62
std::string hash_from_string_hex(std::string const &input, HashingFunctions hash_fct)
Calculate the hash of a string, using a given hashing function, and return its hex representation as ...
Definition: hashing.cpp:64
void relabel_with_hash(Sequence &seq, utils::HashingFunctions hash_function)
Relabel the Sequence using the hash digest of its sites.
Definition: labels.cpp:165
HashingFunctions
List of the currently implemented hashing functions.
Definition: hashing.hpp:53
bool has_valid_label(Sequence const &seq)
Check whether a Sequence has a valid label.
Definition: labels.cpp:193
Sequence const * find_sequence(SequenceSet const &set, std::string const &label)
Return a pointer to a Sequence with a specific label, or nullptr iff not found.
Definition: labels.cpp:52