A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
sequence/functions/functions.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_SEQUENCE_FUNCTIONS_FUNCTIONS_H_
2 #define GENESIS_SEQUENCE_FUNCTIONS_FUNCTIONS_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
37 
38 #include <iosfwd>
39 #include <map>
40 #include <string>
41 
42 namespace genesis {
43 namespace sequence {
44 
45 // =================================================================================================
46 // Forward Declarations
47 // =================================================================================================
48 
49 class Sequence;
50 class SequenceSet;
51 
52 // =================================================================================================
53 // Characteristics
54 // =================================================================================================
55 
63 utils::Bitvector find_sites(
64  Sequence const& seq,
65  std::string const& chars
66 );
67 
75 utils::Bitvector find_sites(
76  Sequence const& seq,
77  utils::CharLookup<bool> const& chars
78 );
79 
88 utils::Bitvector gap_sites(
89  Sequence const& seq,
90  std::string const& gap_chars = nucleic_acid_codes_undetermined()
91 );
92 
102 utils::Bitvector gap_sites(
103  SequenceSet const& set,
104  std::string const& gap_chars = nucleic_acid_codes_undetermined()
105 );
106 
117 bool validate_chars( SequenceSet const& set, std::string const& chars );
118 
122 size_t longest_sequence_length( SequenceSet const& set );
123 
127 size_t total_length( SequenceSet const& set );
128 
132 bool is_alignment( SequenceSet const& set );
133 
134 // =================================================================================================
135 // Modifiers
136 // =================================================================================================
137 
146 void remove_sites( Sequence& seq, utils::Bitvector sites );
147 
158 void remove_sites( SequenceSet& set, utils::Bitvector sites );
159 
163 void remove_gap_sites( SequenceSet& set, std::string const& gap_chars = nucleic_acid_codes_undetermined() );
164 
168 void remove_characters( Sequence& seq, std::string const& search );
169 
174 void remove_characters( SequenceSet& set, std::string const& search );
175 
182 void remove_all_gaps( Sequence& seq, std::string const& gap_chars = nucleic_acid_codes_undetermined() );
183 
190 void remove_all_gaps( SequenceSet& set, std::string const& gap_chars = nucleic_acid_codes_undetermined() );
191 
200 void replace_characters( Sequence& seq, std::string const& search, char replacement );
201 
210 void replace_characters( SequenceSet& set, std::string const& search, char replacement );
211 
218 void replace_u_with_t( Sequence& seq );
219 
226 void replace_u_with_t( SequenceSet& set );
227 
234 void replace_t_with_u( Sequence& seq );
235 
242 void replace_t_with_u( SequenceSet& set );
243 
252 {
256  kDiscard,
257 
262 
263 };
264 
278  SequenceSet& set,
280  std::string const& counter_prefix = "_"
281 );
282 
283 // =================================================================================================
284 // Normalization
285 // =================================================================================================
286 
292 void normalize_nucleic_acid_codes( Sequence& sequence, bool accept_degenerated = true );
293 
299 void normalize_nucleic_acid_codes( SequenceSet& sequence_set, bool accept_degenerated = true );
300 
306 void normalize_amino_acid_codes( Sequence& sequence, bool accept_degenerated = true );
307 
313 void normalize_amino_acid_codes( SequenceSet& sequence_set, bool accept_degenerated = true );
314 
315 // =================================================================================================
316 // Filters
317 // =================================================================================================
318 
325 void filter_min_sequence_length( SequenceSet& set, size_t min_length );
326 
333 void filter_max_sequence_length( SequenceSet& set, size_t max_length );
334 
342 void filter_min_max_sequence_length( SequenceSet& set, size_t min_length, size_t max_length );
343 
344 // =================================================================================================
345 // Print and Output
346 // =================================================================================================
347 
354 std::ostream& operator << ( std::ostream& out, Sequence const& seq );
355 
363 std::ostream& operator << ( std::ostream& out, SequenceSet const& set );
364 
365 } // namespace sequence
366 } // namespace genesis
367 
368 #endif // include guard
void remove_characters(Sequence &seq, std::string const &search)
Remove all of the characters in search from the sites of the Sequence.
The counts are appended to the sequence label, separated by the counter_prefix.
utils::Bitvector gap_sites(Sequence const &seq, std::string const &gap_chars)
Return a Bitvector that is true where the Sequence has a gap and false where not. ...
void filter_min_sequence_length(SequenceSet &set, size_t min_length)
Remove all Sequences from the SequenceSet whose length is below the given min_length.
void normalize_amino_acid_codes(Sequence &sequence, bool accept_degenerated)
Call normalize_amino_acid_code() for each site of the Sequence.
void remove_gap_sites(SequenceSet &set, std::string const &gap_chars)
Remove all sites that only contain gap characters from the SequenceSet.
void remove_sites(Sequence &seq, utils::Bitvector sites)
Remove all sites from a Sequence where the given Bitvector is true, and keep all others.
bool validate_chars(SequenceSet const &set, std::string const &chars)
Returns true iff all Sequences only consist of the given chars.
std::string nucleic_acid_codes_undetermined()
Return all undetermined nucleic acid codes. Those are "NOX.-?".
Definition: codes.cpp:304
void filter_max_sequence_length(SequenceSet &set, size_t max_length)
Remove all Sequences from the SequenceSet whose length is above the given max_length.
utils::Bitvector find_sites(Sequence const &seq, std::string const &chars)
Find sites by character and mark them in a Bitvector.
void replace_t_with_u(Sequence &seq)
Replace all occurrences of T by U in the sites of the Sequence.
void replace_u_with_t(Sequence &seq)
Replace all occurrences of U by T in the sites of the Sequence.
void filter_min_max_sequence_length(SequenceSet &set, size_t min_length, size_t max_length)
Remove all Sequences from the SequenceSet whose length is not in between the min_length and max_length...
std::ostream & operator<<(std::ostream &out, Sequence const &seq)
Print a Sequence to an ostream in the form "label: sites".
void merge_duplicate_sequences(SequenceSet &set, MergeDuplicateSequencesCountPolicy count_policy, std::string const &counter_prefix)
Merge all Sequences in a SequenceSet that have identical sites.
MergeDuplicateSequencesCountPolicy
Provide options for changing how merge_duplicate_sequences() handles the counts of merged Sequences...
size_t total_length(SequenceSet const &set)
Return the total length (sum) of all Sequences in the SequenceSet.
void remove_all_gaps(Sequence &seq, std::string const &gap_chars)
Remove all gap characters from the sites of the Sequence.
bool is_alignment(SequenceSet const &set)
Return true iff all Sequences in the SequenceSet have the same length.
size_t longest_sequence_length(SequenceSet const &set)
Return the length of the longest Sequence in the SequenceSet.
void normalize_nucleic_acid_codes(Sequence &sequence, bool accept_degenerated)
Call normalize_nucleic_acid_code() for each site of the Sequence.
void replace_characters(Sequence &seq, std::string const &search, char replacement)
Replace all occurrences of the chars in search by the replace char, for all sites in the given Sequenc...