A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
labels.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
40 
41 #include <algorithm>
42 #include <cctype>
43 // #include <regex>
44 
45 namespace genesis {
46 namespace sequence {
47 
48 // =================================================================================================
49 // General
50 // =================================================================================================
51 
52 Sequence const* find_sequence( SequenceSet const& set, std::string const& label )
53 {
54  for (Sequence const& s : set) {
55  if (s.label() == label) {
56  return &s;
57  }
58  }
59  return nullptr;
60 }
61 
62 std::unordered_set<std::string> labels( SequenceSet const& set )
63 {
64  std::unordered_set<std::string> result;
65  for( auto const& seq : set ) {
66  result.insert( seq.label() );
67  }
68  return result;
69 }
70 size_t guess_sequence_abundance( Sequence const& sequence )
71 {
72  return guess_sequence_abundance( sequence.label() );
73 }
74 
/**
 * @brief Guess the abundance encoded in a sequence label.
 *
 * Two notations are recognized:
 *
 *  - `size=123` anywhere in the label: the leading digits after `size=` are used.
 *  - `_123` at the very end of the label: everything after the last underscore has to
 *    consist of digits only.
 *
 * If both are present, the trailing `_123` form takes precedence, as it is evaluated last.
 * If neither is present, the abundance defaults to 1.
 *
 * Numbers that do not fit into the result type never raise an exception: an overflowing
 * `size=` value yields 1, and an overflowing `_123` value is ignored, so that whatever was
 * determined before (the `size=` value or the default of 1) is kept.
 *
 * @param label Label to inspect.
 * @return The guessed abundance, or 1 if no abundance annotation was found.
 */
size_t guess_sequence_abundance( std::string const& label )
{
    size_t result = 1;

    // The <cctype> functions have undefined behavior for negative arguments, which plain
    // char can be for non-ASCII bytes. Hence, go through unsigned char.
    auto is_digit_at = [&label]( size_t pos )
    {
        return pos < label.size()
            && std::isdigit( static_cast<unsigned char>( label[ pos ] )) != 0
        ;
    };

    // We only look for a simple number, no sign or decimal points etc
    auto is_digits = []( std::string const& s )
    {
        return s.find_first_not_of( "0123456789" ) == std::string::npos;
    };

    // Parse the leading digits of a string. If the number cannot be represented
    // (std::stoull throws), use the given fallback instead.
    auto parse_or = []( std::string const& text, size_t fallback ) -> size_t
    {
        try {
            return std::stoull( text );
        } catch( ... ) {
            return fallback;
        }
    };

    // Try to find "size=123"
    auto const spos = label.find( "size=" );
    if( spos != std::string::npos && is_digit_at( spos + 5 )) {

        // Parse the substring as far as possible, that is, get all digits.
        result = parse_or( label.substr( spos + 5 ), 1 );
    }

    // Try to find "_123" at the end
    auto const upos = label.find_last_of( "_" );
    if( upos != std::string::npos && is_digit_at( upos + 1 )) {

        // The rest of the label needs to be a number.
        auto const sub = label.substr( upos + 1 );
        if( is_digits( sub ) ) {
            // On overflow, keep what we have so far instead of throwing.
            result = parse_or( sub, result );
        }
    }

    return result;

    // Slow regex version
    // Prepare static regex (no need to re-compile it on every function call).
    // Matches either ";size=123;" or "_123"
    // static const std::string expr = "(?:[;]?size=([0-9]+)[;]?)|(?:_([0-9]+)$)";
    // static std::regex pattern( expr );
    //
    // // Run the expression.
    // std::smatch matches;
    // if( std::regex_search( label, matches, pattern )) {
    //     size_t res;
    //     std::string const num = ( matches[1].str().empty() ? matches[2].str() : matches[1].str() );
    //     sscanf( num.c_str(), "%zu", &res );
    //     return res;
    // } else {
    //     return 1;
    // }
}
128 
129 // =================================================================================================
130 // Uniqueness
131 // =================================================================================================
132 
133 bool has_unique_labels( SequenceSet const& set, bool case_sensitive )
134 {
135  std::unordered_set< std::string > labels;
136  std::string label;
137 
138  for( auto const& seq : set ) {
139  if( case_sensitive ) {
140  label = seq.label();
141  } else {
142  label = utils::to_lower( seq.label() );
143  }
144 
145  if( labels.count( label ) > 0 ) {
146  return false;
147  } else {
148  labels.insert( label );
149  }
150  }
151  return true;
152 }
153 
154 void relabel_sha1( Sequence& seq )
155 {
156  auto digest = utils::SHA1::from_string_hex( seq.sites() );
157  seq.label( digest );
158 }
159 
161 {
162  for( auto& seq : set ) {
163  relabel_sha1( seq );
164  }
165 }
166 
168 {
169  auto digest = utils::SHA256::from_string_hex( seq.sites() );
170  seq.label( digest );
171 }
172 
174 {
175  for( auto& seq : set ) {
176  relabel_sha256( seq );
177  }
178 }
179 
180 void relabel_md5( Sequence& seq )
181 {
182  auto digest = utils::MD5::from_string_hex( seq.sites() );
183  seq.label( digest );
184 }
185 
187 {
188  for( auto& seq : set ) {
189  relabel_sha1( seq );
190  }
191 }
192 
193 // =================================================================================================
194 // Validity
195 // =================================================================================================
196 
/**
 * @brief Check whether a given string is a valid label for a Sequence.
 *
 * A label is valid if every character is a graphical character according to `isgraph`
 * (so no spaces or control characters) and none of the characters is one of `:,();[]'`.
 * An empty label is considered valid, as it contains no offending character.
 *
 * @param label Label to check.
 * @return `true` iff the label contains no invalid character.
 */
bool is_valid_label( std::string const& label )
{
    std::string const invalid_chars = ":,();[]'";
    for( auto c : label ) {
        // The <cctype> functions have undefined behavior for negative arguments, which
        // plain char can be for non-ASCII bytes. Hence, go through unsigned char.
        bool const graphical = std::isgraph( static_cast<unsigned char>( c )) != 0;
        if( ! graphical || invalid_chars.find( c ) != std::string::npos ) {
            return false;
        }
    }
    return true;
}
207 
208 bool has_valid_label( Sequence const& seq )
209 {
210  return is_valid_label( seq.label() );
211 }
212 
213 bool has_valid_labels( SequenceSet const& set )
214 {
215  for( auto const& seq : set ) {
216  if( ! has_valid_label( seq )) {
217  return false;
218  }
219  }
220  return true;
221 }
222 
/**
 * @brief Sanitize a label by replacing all invalid characters with underscores.
 *
 * A character is invalid if it is not a graphical character according to `isgraph`
 * (for example spaces and control characters), or if it is one of `:,();[]'`.
 * Each invalid character is replaced by exactly one underscore, so the result has the
 * same length as the input.
 *
 * @param label Label to sanitize.
 * @return Copy of the label with every invalid character replaced by `_`.
 */
std::string sanitize_label( std::string const& label )
{
    std::string result;
    result.reserve( label.size() );

    std::string const invalid_chars = ":,();[]'";
    for( auto c : label ) {
        // The <cctype> functions have undefined behavior for negative arguments, which
        // plain char can be for non-ASCII bytes. Hence, go through unsigned char.
        bool const graphical = std::isgraph( static_cast<unsigned char>( c )) != 0;
        if( ! graphical || invalid_chars.find( c ) != std::string::npos ) {
            result += '_';
        } else {
            result += c;
        }
    }
    return result;
}
238 
240 {
241  seq.label( sanitize_label( seq.label() ));
242 }
243 
245 {
246  for( auto& seq : set ) {
247  sanitize_label( seq );
248  }
249 }
250 
251 // =================================================================================================
252 // Modifiers
253 // =================================================================================================
254 
256  SequenceSet& set,
257  std::unordered_set<std::string> const& labels,
258  bool invert
259 ) {
260  auto new_last = std::remove_if(
261  set.begin(),
262  set.end(),
263  [&] ( Sequence const& seq ) {
264  return ( !invert && labels.count( seq.label() ) > 0 ) ||
265  ( invert && labels.count( seq.label() ) == 0 );
266  }
267  );
268  set.remove( new_last , set.end() );
269 }
270 
271 } // namespace sequence
272 } // namespace genesis
void relabel_sha256(Sequence &seq)
Relabel the Sequence using the SHA256 hash digest of its sites.
Definition: labels.cpp:167
void relabel_sha1(Sequence &seq)
Relabel the Sequence using the SHA1 hash digest of its sites.
Definition: labels.cpp:154
size_t guess_sequence_abundance(Sequence const &sequence)
Guess the abundance of a Sequence, using its label.
Definition: labels.cpp:70
std::string const & sites() const
Definition: sequence.cpp:58
void filter_by_label_list(SequenceSet &set, std::unordered_set< std::string > const &labels, bool invert)
Remove all those Sequences from a SequenceSet whose labels are in the given list. ...
Definition: labels.cpp:255
bool has_unique_labels(SequenceSet const &set, bool case_sensitive)
Return true iff all labels of the Sequences in the SequenceSet are unique.
Definition: labels.cpp:133
std::string to_lower(std::string const &str)
Return an all-lowercase copy of the given string, locale-aware.
Definition: string.hpp:198
void relabel_md5(Sequence &seq)
Relabel the Sequence using the MD5 hash digest of its sites.
Definition: labels.cpp:180
std::string const & label() const
Definition: sequence.cpp:44
bool has_valid_labels(SequenceSet const &set)
Check whether all Sequences in a SequenceSet have valid labels.
Definition: labels.cpp:213
void sanitize_labels(SequenceSet &set)
Sanitize the labels of all Sequences in the SequenceSet by replacing all invalid characters with underscores.
Definition: labels.cpp:244
static std::string from_string_hex(std::string const &input)
Calculate the checksum for the content of a string.
Definition: sha1.cpp:161
Provides some commonly used string utility functions.
void remove(size_t index)
Remove the Sequence at a given index from the SequenceSet.
bool is_valid_label(std::string const &label)
Check whether a given string is a valid label for a Sequence.
Definition: labels.cpp:197
Store a set of Sequences.
std::string sanitize_label(std::string const &label)
Sanitize a label by replacing all invalid characters with underscores.
Definition: labels.cpp:223
std::unordered_set< std::string > labels(SequenceSet const &set)
Return a set of all labels of the SequenceSet.
Definition: labels.cpp:62
static std::string from_string_hex(std::string const &input)
Calculate the checksum for the content of a string.
Definition: sha256.cpp:216
bool has_valid_label(Sequence const &seq)
Check whether a Sequence has a valid label.
Definition: labels.cpp:208
static std::string from_string_hex(std::string const &input)
Calculate the checksum for the content of a string.
Definition: md5.cpp:196
Sequence const * find_sequence(SequenceSet const &set, std::string const &label)
Return a pointer to a Sequence with a specific label, or nullptr iff not found.
Definition: labels.cpp:52