A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
sequence/functions/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
41 
42 #include <algorithm>
43 #include <array>
44 #include <assert.h>
45 #include <cctype>
46 #include <numeric>
47 #include <ostream>
48 #include <sstream>
49 #include <stdexcept>
50 #include <string>
51 #include <unordered_map>
52 #include <unordered_set>
53 
54 namespace genesis {
55 namespace sequence {
56 
57 // =================================================================================================
58 // Characteristics
59 // =================================================================================================
60 
61 utils::Bitvector gap_sites( Sequence const& seq, std::string const& gap_chars )
62 {
63  auto result = utils::Bitvector( seq.length() );
64 
65  // Init lookup array.
66  auto lookup = utils::CharLookup<bool>( false );
67  lookup.set_selection_upper_lower( gap_chars, true );
68 
69  for( size_t i = 0; i < seq.length(); ++i ) {
70  result.set( i, lookup[ seq[ i ] ] );
71  }
72  return result;
73 }
74 
75 utils::Bitvector gap_sites( SequenceSet const& set, std::string const& gap_chars )
76 {
77  // Edge case.
78  if( set.size() == 0 ) {
79  return utils::Bitvector();
80  }
81 
82  // Init result bitvector to all true. Then, for every site that is not a gap, reset to false.
83  auto result = utils::Bitvector( set[0].length(), true );
84 
85  // Init lookup array.
86  auto lookup = utils::CharLookup<bool>( false );
87  lookup.set_selection_upper_lower( gap_chars, true );
88 
89  // Process all sequences in the set.
90  for( auto const& seq : set ) {
91  if( seq.length() != result.size() ) {
92  throw std::runtime_error(
93  "Cannot calculate gap_sites() if SequenceSet is not an alignment."
94  );
95  }
96 
97  // Process the sites of the sequence. If it is not a gap, set it to false in the bitvector.
98  // This way, only sites that are all-gap will remain with value `true` in the end.
99  for( size_t i = 0; i < seq.length(); ++i ) {
100  if( ! lookup[ seq[ i ] ] ) {
101  result.set( i, false );
102  }
103  }
104  }
105 
106  return result;
107 }
108 
109 bool validate_chars( SequenceSet const& set, std::string const& chars )
110 {
111  // Init array to false, then set all necessary chars to true.
112  auto lookup = utils::CharLookup<bool>( false );
113  lookup.set_selection_upper_lower( chars, true );
114 
115  for( auto& s : set ) {
116  for( auto& c : s ) {
117  // get rid of this check and leave it to the parser/lexer/stream iterator
118  if( c < 0 ) {
119  return false;
120  }
121  assert( c >= 0 );
122  if( ! lookup[ c ] ) {
123  return false;
124  }
125  }
126  }
127  return true;
128 }
129 
130 // -------------------------------------------------------------------------
131 // Length and length checks
132 // -------------------------------------------------------------------------
133 
135 {
136  size_t max = 0;
137  for( auto const& seq : set ) {
138  max = std::max( max, seq.length() );
139  }
140  return max;
141 }
142 
143 size_t total_length( SequenceSet const& set )
144 {
145  return std::accumulate( set.begin(), set.end(), 0,
146  [] (size_t c, Sequence const& s) {
147  return c + s.length();
148  }
149  );
150 }
151 
152 bool is_alignment( SequenceSet const& set )
153 {
154  if( set.size() == 0 ) {
155  return true;
156  }
157 
158  size_t length = set[0].length();
159  for( auto& s : set ) {
160  if( s.length() != length ) {
161  return false;
162  }
163  }
164  return true;
165 }
166 
167 // =================================================================================================
168 // Modifiers
169 // =================================================================================================
170 
172 {
173  if( seq.length() != sites.size() ) {
174  throw std::runtime_error(
175  "Cannot remove sites from Sequence. "
176  "Given Bitvector has not the same size as the Sequence."
177  );
178  }
179 
180  auto const num_sites = sites.size() - sites.count();
181  std::string result;
182  result.reserve( num_sites );
183 
184  for( size_t i = 0; i < sites.size(); ++i ) {
185  if( ! sites[i] ) {
186  result += seq[i];
187  }
188  }
189 
190  seq.sites( result );
191 }
192 
194 {
195  for( auto const& seq : set ) {
196  if( seq.length() != sites.size() ) {
197  throw std::runtime_error(
198  "Cannot remove sites from SequenceSet. "
199  "Given Bitvector has not the same size as the Sequences."
200  );
201  }
202  }
203 
204  for( auto& seq : set ) {
205  remove_sites( seq, sites );
206  }
207 }
208 
209 void remove_gap_sites( SequenceSet& set, std::string const& gap_chars )
210 {
211  auto sites = gap_sites( set, gap_chars );
212  remove_sites( set, sites );
213 }
214 
215 void remove_characters( Sequence& seq, std::string const& search )
216 {
217  auto is_search_char = [&] ( char c ) {
218  return search.find( c ) != std::string::npos;
219  };
220 
221  auto& str = seq.sites();
222  str.erase( std::remove_if( str.begin(), str.end(), is_search_char ), str.end() );
223 }
224 
225 void remove_characters( SequenceSet& set, std::string const& search )
226 {
227  for( auto& sequence : set ) {
228  remove_characters( sequence, search );
229  }
230 }
231 
232 void remove_all_gaps( Sequence& seq, std::string const& gap_chars )
233 {
234  remove_characters( seq, gap_chars );
235 }
236 
237 void remove_all_gaps( SequenceSet& set, std::string const& gap_chars )
238 {
239  remove_characters( set, gap_chars );
240 }
241 
242 void replace_characters( Sequence& seq, std::string const& search, char replacement )
243 {
244  seq.sites() = utils::replace_all_chars( seq.sites(), search, replacement );
245 }
246 
247 void replace_characters( SequenceSet& set, std::string const& search, char replacement )
248 {
249  for( auto& sequence : set ) {
250  replace_characters( sequence, search, replacement );
251  }
252 }
253 
255 {
256  for( auto& c : seq.sites() ) {
257  if( c == 'U' ) {
258  c = 'T';
259  }
260  if( c == 'u' ) {
261  c = 't';
262  }
263  }
264 }
265 
267 {
268  for( auto& sequence : set ) {
269  replace_u_with_t( sequence );
270  }
271 }
272 
274 {
275  for( auto& c : seq.sites() ) {
276  if( c == 'T' ) {
277  c = 'U';
278  }
279  if( c == 't' ) {
280  c = 'u';
281  }
282  }
283 }
284 
286 {
287  for( auto& sequence : set ) {
288  replace_t_with_u( sequence );
289  }
290 }
291 
293  SequenceSet& set,
295  std::string const& counter_prefix
296 ) {
297  // Helper to keep track of which sequences (at position i in the set) occured how often.
298  struct Duplicate
299  {
300  size_t index;
301  size_t count;
302  };
303 
304  // Find duplicates and count their occurences.
305  // TODO this is a very memory intense step. find an algo that does not need to copy all sites...
306  std::unordered_map< std::string, Duplicate > dup_map;
307  size_t i = 0;
308  while( i < set.size() ) {
309  auto& seq = set[i];
310 
311  if( dup_map.count( seq.sites() ) == 0 ) {
312 
313  // If it is a new sequence, init the map entry and move to the next sequence.
314  dup_map[ seq.sites() ].index = i;
315  dup_map[ seq.sites() ].count = 1;
316  ++i;
317 
318  } else {
319 
320  // If we already saw that sequence, increment its counter, and remove the sequence
321  // from the set. Do not increment i - we deleted a sequence, so staying at the
322  // position automatically "moves" to the next one.
323  ++dup_map[ seq.sites() ].count;
324  set.remove(i);
325  }
326  }
327 
328  // We do not need to relabel sequences.
329  if( count_policy == MergeDuplicateSequencesCountPolicy::kDiscard ) {
330  return;
331  }
332 
333  // Relabel using the counts.
334  for( size_t i = 0; i < set.size(); ++i ) {
335  auto& seq = set[i];
336 
337  // The sequence needs to be in the map, as we added it in the previous step.
338  // It also needs to have the same index, as we never changed that.
339  assert( dup_map.count(seq.sites()) > 0 );
340  assert( dup_map[ seq.sites() ].index == i );
341 
342  // Append the count to either the label or the metadata.
343  auto count = dup_map[ seq.sites() ].count;
345  auto new_label = seq.label() + counter_prefix + std::to_string(count);
346  seq.label( new_label );
347 
348  } else if( count_policy == MergeDuplicateSequencesCountPolicy::kAppendToMetadata ) {
349  auto new_meta = seq.metadata() + counter_prefix + std::to_string(count);
350  seq.metadata( new_meta );
351 
352  } else {
353 
354  // We already skipped the discard case, so this here should not happen.
355  assert( false );
356  }
357  }
358 }
359 
360 // =================================================================================================
361 // Filters
362 // =================================================================================================
363 
364 void filter_min_sequence_length( SequenceSet& set, size_t min_length )
365 {
366  size_t index = 0;
367  while( index < set.size() ) {
368  if( set.at(index).length() < min_length ) {
369  set.remove(index);
370  } else {
371  ++index;
372  }
373  }
374 }
375 
376 void filter_max_sequence_length( SequenceSet& set, size_t max_length )
377 {
378  size_t index = 0;
379  while( index < set.size() ) {
380  if( set.at(index).length() > max_length ) {
381  set.remove(index);
382  } else {
383  ++index;
384  }
385  }
386 }
387 
388 void filter_min_max_sequence_length( SequenceSet& set, size_t min_length, size_t max_length )
389 {
390  size_t index = 0;
391  while( index < set.size() ) {
392  auto len = set.at(index).length();
393  if( len < min_length || len > max_length ) {
394  set.remove(index);
395  } else {
396  ++index;
397  }
398  }
399 }
400 
401 // =================================================================================================
402 // Print and Output
403 // =================================================================================================
404 
405 // -------------------------------------------------------------------------
406 // Ostream
407 // -------------------------------------------------------------------------
408 
409 std::ostream& operator << ( std::ostream& out, Sequence const& seq )
410 {
411  auto printer = PrinterSimple();
412  printer.length_limit( 100 );
413 
414  printer.print( out, seq );
415  return out;
416 }
417 
418 std::ostream& operator << ( std::ostream& out, SequenceSet const& set )
419 {
420  auto printer = PrinterSimple();
421  printer.length_limit( 100 );
422  printer.sequence_limit( 10 );
423 
424  printer.print( out, set );
425  return out;
426 }
427 
428 } // namespace sequence
429 } // namespace genesis
reference at(size_t index)
void remove_characters(Sequence &seq, std::string const &search)
Remove all of the characters in search from the sites of the Sequence.
size_t size() const
Returns the size (number of total bits) of this Bitvector.
Definition: bitvector.hpp:105
size_t count() const
Counts the number of set bits in the Bitvector.
Definition: bitvector.cpp:226
The counts are appended to the sequence label, separated by the counter_prefix.
utils::Bitvector gap_sites(Sequence const &seq, std::string const &gap_chars)
Return a Bitvector that is true where the Sequence has a gap and false where not. ...
size_t length() const
Return the length (number of sites) of this sequence.
Definition: sequence.cpp:107
std::string const & sites() const
Definition: sequence.cpp:72
void filter_min_sequence_length(SequenceSet &set, size_t min_length)
Remove all Sequences from the SequenceSet whose length is below the given min_length.
void set(size_t index)
Sets the value of a single bit to true, with boundary check.
Definition: bitvector.hpp:135
The counts are appended to the sequence metadata, separated by the counter_prefix.
void remove_gap_sites(SequenceSet &set, std::string const &gap_chars)
Remove all sites that only contain gap characters from the SequenceSet.
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:300
void remove_sites(Sequence &seq, utils::Bitvector sites)
Remove all sites from a Sequence where the given Bitvector is true, and keep all others.
std::string replace_all_chars(std::string const &text, std::string const &search_chars, char replace)
Replace all occurrences of the search_chars in text by the replace char.
Definition: string.cpp:200
bool validate_chars(SequenceSet const &set, std::string const &chars)
Returns true iff all Sequences only consist of the given chars.
void filter_max_sequence_length(SequenceSet &set, size_t max_length)
Remove all Sequences from the SequenceSet whose length is above the given max_length.
void replace_t_with_u(Sequence &seq)
Replace all occurrences of T by U in the sites of the Sequence.
void replace_u_with_t(Sequence &seq)
Replace all occurrences of U by T in the sites of the Sequence.
void filter_min_max_sequence_length(SequenceSet &set, size_t min_length, size_t max_length)
Remove all Sequences from the SequenceSet whose length is not inbetween the min_length and max_length...
std::ostream & operator<<(std::ostream &out, Sequence const &seq)
Print a Sequence to an ostream in the form "label: sites".
void merge_duplicate_sequences(SequenceSet &set, MergeDuplicateSequencesCountPolicy count_policy, std::string const &counter_prefix)
Merge all Sequences in a SequenceSet that have identical sites.
MergeDuplicateSequencesCountPolicy
Provide options for changing how merge_duplicate_sequences() handles the counts of merged Sequences...
size_t total_length(SequenceSet const &set)
Return the total length (sum) of all Sequences in the SequenceSet.
Provides some commonly used string utility functions.
void remove(size_t index)
Remove the Sequence at a given index from the SequenceSet.
void remove_all_gaps(Sequence &seq, std::string const &gap_chars)
Remove all gap characters from the sites of the Sequence.
Store a set of Sequences.
Provides easy and fast logging functionality.
bool is_alignment(SequenceSet const &set)
Return true iff all Sequences in the SequenceSet have the same length.
void set_selection_upper_lower(std::string const &chars, T value)
Set the lookup status for both the upper and lower case of all chars that are contained in a given st...
size_t longest_sequence_length(SequenceSet const &set)
Return the length of the longest Sequence in the SequenceSet.
Simple printer class for Sequences and SequenceSets.
Definition: simple.hpp:61
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
void replace_characters(Sequence &seq, std::string const &search, char replacement)
Replace all occurences of the chars in search by the replace char, for all sites in the given Sequenc...