A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
sequence/functions/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
40 
41 #include <algorithm>
42 #include <array>
43 #include <assert.h>
44 #include <cctype>
45 #include <numeric>
46 #include <ostream>
47 #include <sstream>
48 #include <stdexcept>
49 #include <string>
50 #include <unordered_map>
51 #include <unordered_set>
52 
53 namespace genesis {
54 namespace sequence {
55 
56 // =================================================================================================
57 // Characteristics
58 // =================================================================================================
59 
61  Sequence const& seq,
62  std::string const& chars
63 ) {
64  // Init lookup array.
65  auto lookup = utils::CharLookup<bool>( false );
66  lookup.set_selection_upper_lower( chars, true );
67 
68  return find_sites( seq, lookup );
69 }
70 
72  Sequence const& seq,
73  utils::CharLookup<bool> const& chars
74 ) {
75  auto result = utils::Bitvector( seq.length(), false );
76  for( size_t i = 0; i < seq.length(); ++i ) {
77  result.set( i, chars[ seq[ i ] ] );
78  }
79  return result;
80 }
81 
82 utils::Bitvector gap_sites( Sequence const& seq, std::string const& gap_chars )
83 {
84  return find_sites( seq, gap_chars );
85 }
86 
87 utils::Bitvector gap_sites( SequenceSet const& set, std::string const& gap_chars )
88 {
89  // Edge case.
90  if( set.size() == 0 ) {
91  return utils::Bitvector();
92  }
93 
94  // Init result bitvector to all true. Then, for every site that is not a gap, reset to false.
95  auto result = utils::Bitvector( set[0].length(), true );
96 
97  // Init lookup array.
98  auto lookup = utils::CharLookup<bool>( false );
99  lookup.set_selection_upper_lower( gap_chars, true );
100 
101  // Process all sequences in the set.
102  for( auto const& seq : set ) {
103  if( seq.length() != result.size() ) {
104  throw std::runtime_error(
105  "Cannot calculate gap_sites() if SequenceSet is not an alignment."
106  );
107  }
108 
109  // Process the sites of the sequence. If it is not a gap, set it to false in the bitvector.
110  // This way, only sites that are all-gap will remain with value `true` in the end.
111  for( size_t i = 0; i < seq.length(); ++i ) {
112  if( ! lookup[ seq[ i ] ] ) {
113  result.set( i, false );
114  }
115  }
116  }
117 
118  return result;
119 }
120 
121 bool validate_chars( SequenceSet const& set, std::string const& chars )
122 {
123  // Init array to false, then set all necessary chars to true.
124  auto lookup = utils::CharLookup<bool>( false );
125  lookup.set_selection_upper_lower( chars, true );
126 
127  for( auto& s : set ) {
128  for( auto& c : s ) {
129  // get rid of this check and leave it to the parser/lexer/stream iterator
130  if( c < 0 ) {
131  return false;
132  }
133  assert( c >= 0 );
134  if( ! lookup[ c ] ) {
135  return false;
136  }
137  }
138  }
139  return true;
140 }
141 
142 // -------------------------------------------------------------------------
143 // Length and length checks
144 // -------------------------------------------------------------------------
145 
147 {
148  size_t max = 0;
149  for( auto const& seq : set ) {
150  max = std::max( max, seq.length() );
151  }
152  return max;
153 }
154 
155 size_t total_length( SequenceSet const& set )
156 {
157  return std::accumulate( set.begin(), set.end(), 0,
158  [] (size_t c, Sequence const& s) {
159  return c + s.length();
160  }
161  );
162 }
163 
164 bool is_alignment( SequenceSet const& set )
165 {
166  if( set.size() == 0 ) {
167  return true;
168  }
169 
170  size_t length = set[0].length();
171  for( auto& s : set ) {
172  if( s.length() != length ) {
173  return false;
174  }
175  }
176  return true;
177 }
178 
179 // =================================================================================================
180 // Modifiers
181 // =================================================================================================
182 
184 {
185  if( seq.length() != sites.size() ) {
186  throw std::runtime_error(
187  "Cannot remove sites from Sequence. "
188  "Given Bitvector has not the same size as the Sequence."
189  );
190  }
191 
192  auto const num_sites = sites.size() - sites.count();
193  std::string result;
194  result.reserve( num_sites );
195 
196  for( size_t i = 0; i < sites.size(); ++i ) {
197  if( ! sites[i] ) {
198  result += seq[i];
199  }
200  }
201 
202  seq.sites( result );
203 }
204 
206 {
207  for( auto const& seq : set ) {
208  if( seq.length() != sites.size() ) {
209  throw std::runtime_error(
210  "Cannot remove sites from SequenceSet. "
211  "Given Bitvector has not the same size as the Sequences."
212  );
213  }
214  }
215 
216  for( auto& seq : set ) {
217  remove_sites( seq, sites );
218  }
219 }
220 
221 void remove_gap_sites( SequenceSet& set, std::string const& gap_chars )
222 {
223  auto sites = gap_sites( set, gap_chars );
224  remove_sites( set, sites );
225 }
226 
227 void remove_characters( Sequence& seq, std::string const& search )
228 {
229  auto is_search_char = [&] ( char c ) {
230  return search.find( c ) != std::string::npos;
231  };
232 
233  auto& str = seq.sites();
234  str.erase( std::remove_if( str.begin(), str.end(), is_search_char ), str.end() );
235 }
236 
237 void remove_characters( SequenceSet& set, std::string const& search )
238 {
239  for( auto& sequence : set ) {
240  remove_characters( sequence, search );
241  }
242 }
243 
244 void remove_all_gaps( Sequence& seq, std::string const& gap_chars )
245 {
246  remove_characters( seq, gap_chars );
247 }
248 
249 void remove_all_gaps( SequenceSet& set, std::string const& gap_chars )
250 {
251  remove_characters( set, gap_chars );
252 }
253 
254 void replace_characters( Sequence& seq, std::string const& search, char replacement )
255 {
256  seq.sites() = utils::replace_all_chars( seq.sites(), search, replacement );
257 }
258 
259 void replace_characters( SequenceSet& set, std::string const& search, char replacement )
260 {
261  for( auto& sequence : set ) {
262  replace_characters( sequence, search, replacement );
263  }
264 }
265 
267 {
268  for( auto& c : seq.sites() ) {
269  if( c == 'U' ) {
270  c = 'T';
271  }
272  if( c == 'u' ) {
273  c = 't';
274  }
275  }
276 }
277 
279 {
280  for( auto& sequence : set ) {
281  replace_u_with_t( sequence );
282  }
283 }
284 
286 {
287  for( auto& c : seq.sites() ) {
288  if( c == 'T' ) {
289  c = 'U';
290  }
291  if( c == 't' ) {
292  c = 'u';
293  }
294  }
295 }
296 
298 {
299  for( auto& sequence : set ) {
300  replace_t_with_u( sequence );
301  }
302 }
303 
305  SequenceSet& set,
307  std::string const& counter_prefix
308 ) {
309  // Helper to keep track of which sequences (at position i in the set) occured how often.
310  struct Duplicate
311  {
312  size_t index;
313  size_t count;
314  };
315 
316  // Find duplicates and count their occurences.
317  // TODO this is a very memory intense step. find an algo that does not need to copy all sites...
318  std::unordered_map< std::string, Duplicate > dup_map;
319  size_t i = 0;
320  while( i < set.size() ) {
321  auto& seq = set[i];
322 
323  if( dup_map.count( seq.sites() ) == 0 ) {
324 
325  // If it is a new sequence, init the map entry and move to the next sequence.
326  dup_map[ seq.sites() ].index = i;
327  dup_map[ seq.sites() ].count = 1;
328  ++i;
329 
330  } else {
331 
332  // If we already saw that sequence, increment its counter, and remove the sequence
333  // from the set. Do not increment i - we deleted a sequence, so staying at the
334  // position automatically "moves" to the next one.
335  ++dup_map[ seq.sites() ].count;
336  set.remove(i);
337  }
338  }
339 
340  // We do not need to relabel sequences.
341  if( count_policy == MergeDuplicateSequencesCountPolicy::kDiscard ) {
342  return;
343  }
344 
345  // Relabel using the counts.
346  for( size_t i = 0; i < set.size(); ++i ) {
347  auto& seq = set[i];
348 
349  // The sequence needs to be in the map, as we added it in the previous step.
350  // It also needs to have the same index, as we never changed that.
351  assert( dup_map.count(seq.sites()) > 0 );
352  assert( dup_map[ seq.sites() ].index == i );
353 
354  // Append the count to the label.
355  auto count = dup_map[ seq.sites() ].count;
357  auto new_label = seq.label() + counter_prefix + std::to_string(count);
358  seq.label( new_label );
359 
360  } else {
361 
362  // We already skipped the discard case, so this here should not happen.
363  assert( false );
364  }
365  }
366 }
367 
368 // =================================================================================================
369 // Normalization
370 // =================================================================================================
371 
372 void normalize_nucleic_acid_codes( Sequence& sequence, bool accept_degenerated )
373 {
374  for( auto& c : sequence.sites() ) {
375  c = normalize_nucleic_acid_code( c, accept_degenerated );
376  }
377 }
378 
379 void normalize_nucleic_acid_codes( SequenceSet& sequence_set, bool accept_degenerated )
380 {
381  for( auto& sequence : sequence_set ) {
382  normalize_nucleic_acid_codes( sequence, accept_degenerated );
383  }
384 }
385 
386 void normalize_amino_acid_codes( Sequence& sequence, bool accept_degenerated )
387 {
388  for( auto& c : sequence.sites() ) {
389  c = normalize_amino_acid_code( c, accept_degenerated );
390  }
391 }
392 
393 void normalize_amino_acid_codes( SequenceSet& sequence_set, bool accept_degenerated )
394 {
395  for( auto& sequence : sequence_set ) {
396  normalize_amino_acid_codes( sequence, accept_degenerated );
397  }
398 }
399 
400 // =================================================================================================
401 // Filters
402 // =================================================================================================
403 
404 void filter_min_sequence_length( SequenceSet& set, size_t min_length )
405 {
406  size_t index = 0;
407  while( index < set.size() ) {
408  if( set.at(index).length() < min_length ) {
409  set.remove(index);
410  } else {
411  ++index;
412  }
413  }
414 }
415 
416 void filter_max_sequence_length( SequenceSet& set, size_t max_length )
417 {
418  size_t index = 0;
419  while( index < set.size() ) {
420  if( set.at(index).length() > max_length ) {
421  set.remove(index);
422  } else {
423  ++index;
424  }
425  }
426 }
427 
428 void filter_min_max_sequence_length( SequenceSet& set, size_t min_length, size_t max_length )
429 {
430  size_t index = 0;
431  while( index < set.size() ) {
432  auto len = set.at(index).length();
433  if( len < min_length || len > max_length ) {
434  set.remove(index);
435  } else {
436  ++index;
437  }
438  }
439 }
440 
441 // =================================================================================================
442 // Print and Output
443 // =================================================================================================
444 
445 // -------------------------------------------------------------------------
446 // Ostream
447 // -------------------------------------------------------------------------
448 
449 std::ostream& operator << ( std::ostream& out, Sequence const& seq )
450 {
451  auto printer = PrinterSimple();
452  printer.length_limit( 100 );
453 
454  printer.print( out, seq );
455  return out;
456 }
457 
458 std::ostream& operator << ( std::ostream& out, SequenceSet const& set )
459 {
460  auto printer = PrinterSimple();
461  printer.length_limit( 100 );
462  printer.sequence_limit( 10 );
463 
464  printer.print( out, set );
465  return out;
466 }
467 
468 } // namespace sequence
469 } // namespace genesis
reference at(size_t index)
void remove_characters(Sequence &seq, std::string const &search)
Remove all of the characters in search from the sites of the Sequence.
size_t size() const
Return the size (number of bits) of this Bitvector.
Definition: bitvector.hpp:200
size_t count() const
Count the number of set bits in the Bitvector, that is, its Hamming weight.
Definition: bitvector.cpp:174
The counts are appended to the sequence label, separated by the counter_prefix.
utils::Bitvector gap_sites(Sequence const &seq, std::string const &gap_chars)
Return a Bitvector that is true where the Sequence has a gap and false where not. ...
size_t length() const
Return the length (number of sites) of this sequence.
Definition: sequence.cpp:92
std::string const & sites() const
Definition: sequence.cpp:58
void filter_min_sequence_length(SequenceSet &set, size_t min_length)
Remove all Sequences from the SequenceSet whose length is below the given min_length.
void normalize_amino_acid_codes(Sequence &sequence, bool accept_degenerated)
Call normalize_amino_acid_code() for each site of the Sequence.
void set(size_t index)
Set the value of a single bit to true, with boundary check.
Definition: bitvector.hpp:139
void remove_gap_sites(SequenceSet &set, std::string const &gap_chars)
Remove all sites that only contain gap characters from the SequenceSet.
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:373
void remove_sites(Sequence &seq, utils::Bitvector sites)
Remove all sites from a Sequence where the given Bitvector is true, and keep all others.
std::string replace_all_chars(std::string const &text, std::string const &search_chars, char replace)
Replace all occurrences of the search_chars in text by the replace char.
Definition: string.cpp:273
bool validate_chars(SequenceSet const &set, std::string const &chars)
Returns true iff all Sequences only consist of the given chars.
void filter_max_sequence_length(SequenceSet &set, size_t max_length)
Remove all Sequences from the SequenceSet whose length is above the given max_length.
utils::Bitvector find_sites(Sequence const &seq, std::string const &chars)
Find sites by character and mark them in a Bitvector.
void replace_t_with_u(Sequence &seq)
Replace all occurrences of T by U in the sites of the Sequence.
void replace_u_with_t(Sequence &seq)
Replace all occurrences of U by T in the sites of the Sequence.
void filter_min_max_sequence_length(SequenceSet &set, size_t min_length, size_t max_length)
Remove all Sequences from the SequenceSet whose length is not inbetween the min_length and max_length...
std::ostream & operator<<(std::ostream &out, Sequence const &seq)
Print a Sequence to an ostream in the form "label: sites".
void merge_duplicate_sequences(SequenceSet &set, MergeDuplicateSequencesCountPolicy count_policy, std::string const &counter_prefix)
Merge all Sequences in a SequenceSet that have identical sites.
MergeDuplicateSequencesCountPolicy
Provide options for changing how merge_duplicate_sequences() handles the counts of merged Sequences...
size_t total_length(SequenceSet const &set)
Return the total length (sum) of all Sequences in the SequenceSet.
char normalize_amino_acid_code(char code, bool accept_degenerated)
Normalize an amino acid code.
Definition: codes.cpp:414
Provides some commonly used string utility functions.
void remove(size_t index)
Remove the Sequence at a given index from the SequenceSet.
void remove_all_gaps(Sequence &seq, std::string const &gap_chars)
Remove all gap characters from the sites of the Sequence.
Store a set of Sequences.
Provides easy and fast logging functionality.
bool is_alignment(SequenceSet const &set)
Return true iff all Sequences in the SequenceSet have the same length.
void set_selection_upper_lower(std::string const &chars, T value)
Set the lookup status for both the upper and lower case of all chars that are contained in a given st...
size_t longest_sequence_length(SequenceSet const &set)
Return the length of the longest Sequence in the SequenceSet.
char normalize_nucleic_acid_code(char code, bool accept_degenerated)
Normalize a nucleic acide code.
Definition: codes.cpp:355
void normalize_nucleic_acid_codes(Sequence &sequence, bool accept_degenerated)
Call normalize_nucleic_acid_code() for each site of the Sequence.
Simple printer class for Sequences and SequenceSets.
Definition: simple.hpp:61
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
void replace_characters(Sequence &seq, std::string const &search, char replacement)
Replace all occurences of the chars in search by the replace char, for all sites in the given Sequenc...