A library for working with phylogenetic and population genetic data.
v0.27.0
sequence/functions/functions.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_SEQUENCE_FUNCTIONS_FUNCTIONS_H_
2 #define GENESIS_SEQUENCE_FUNCTIONS_FUNCTIONS_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
37 
38 #include <iosfwd>
39 #include <map>
40 #include <string>
41 
42 namespace genesis {
43 namespace sequence {
44 
45 // =================================================================================================
46 // Forward Declarations
47 // =================================================================================================
48 
49 class Sequence;
50 class SequenceSet;
51 
52 // =================================================================================================
53 // Characteristics
54 // =================================================================================================
55 
// Find sites by character and mark them in a Bitvector.
//
// seq:   the Sequence whose sites are examined.
// chars: the characters to look for, given as a std::string.
// NOTE(review): the exact matching rules (e.g. case sensitivity) are not visible in
// this header - confirm against sequence/functions/functions.cpp.
63 utils::Bitvector find_sites(
64  Sequence const& seq,
65  std::string const& chars
66 );
67 
// Overload of find_sites() that receives the characters of interest as a
// utils::CharLookup<bool> instead of a std::string.
// NOTE(review): presumably a site is marked when its character is set in the
// lookup - confirm against the implementation.
75 utils::Bitvector find_sites(
76  Sequence const& seq,
77  utils::CharLookup<bool> const& chars
78 );
79 
// Return a Bitvector that is true where the Sequence has a gap and false where not.
//
// gap_chars: the characters that count as gaps; when omitted, the result of
//            nucleic_acid_codes_undetermined() is used.
88 utils::Bitvector gap_sites(
89  Sequence const& seq,
90  std::string const& gap_chars = nucleic_acid_codes_undetermined()
91 );
92 
// Overload of gap_sites() that takes a whole SequenceSet.
//
// gap_chars: the characters that count as gaps; when omitted, the result of
//            nucleic_acid_codes_undetermined() is used.
// NOTE(review): how the gaps of the individual Sequences are combined into the
// single returned Bitvector is not visible in this header - confirm against
// sequence/functions/functions.cpp.
102 utils::Bitvector gap_sites(
103  SequenceSet const& set,
104  std::string const& gap_chars = nucleic_acid_codes_undetermined()
105 );
106 
// Returns true iff all Sequences in the set only consist of the given chars.
117 bool validate_chars( SequenceSet const& set, std::string const& chars );
118 
// Return the length of the longest Sequence in the SequenceSet.
122 size_t longest_sequence_length( SequenceSet const& set );
123 
// Return the total length (sum) of all Sequences in the SequenceSet.
127 size_t total_length( SequenceSet const& set );
128 
// Return true iff all Sequences in the SequenceSet have the same length.
132 bool is_alignment( SequenceSet const& set );
133 
134 // =================================================================================================
135 // Modifiers
136 // =================================================================================================
137 
// Remove all sites from a Sequence where the given Bitvector is true, and keep
// all others. The Sequence is modified in place; the Bitvector is taken by value.
146 void remove_sites( Sequence& seq, utils::Bitvector sites );
147 
// Overload of remove_sites() that operates on a SequenceSet; the Bitvector is
// taken by value.
// NOTE(review): presumably the sites marked true are removed from every Sequence
// in the set - confirm against the implementation.
158 void remove_sites( SequenceSet& set, utils::Bitvector sites );
159 
// Remove all sites that only contain gap characters from the SequenceSet.
// gap_chars defaults to the result of nucleic_acid_codes_undetermined().
163 void remove_gap_sites( SequenceSet& set, std::string const& gap_chars = nucleic_acid_codes_undetermined() );
164 
// Remove all of the characters in search from the sites of the Sequence.
168 void remove_characters( Sequence& seq, std::string const& search );
169 
// Overload of remove_characters() that operates on a SequenceSet.
// NOTE(review): presumably the characters in search are removed from every
// Sequence in the set - confirm against the implementation.
174 void remove_characters( SequenceSet& set, std::string const& search );
175 
// Remove all gap characters from the sites of the Sequence.
// gap_chars defaults to the result of nucleic_acid_codes_undetermined().
182 void remove_all_gaps( Sequence& seq, std::string const& gap_chars = nucleic_acid_codes_undetermined() );
183 
// Overload of remove_all_gaps() that operates on a SequenceSet.
// gap_chars defaults to the result of nucleic_acid_codes_undetermined().
// NOTE(review): presumably gap characters are removed from every Sequence in the
// set - confirm against the implementation.
190 void remove_all_gaps( SequenceSet& set, std::string const& gap_chars = nucleic_acid_codes_undetermined() );
191 
// Replace all occurrences of the chars in search by the replacement char, for all
// sites in the given Sequence.
200 void replace_characters( Sequence& seq, std::string const& search, char replacement );
201 
// Overload of replace_characters() that operates on a SequenceSet.
// NOTE(review): presumably the replacement is applied to every Sequence in the
// set - confirm against the implementation.
210 void replace_characters( SequenceSet& set, std::string const& search, char replacement );
211 
// Replace all occurrences of U by T in the sites of the Sequence.
// NOTE(review): whether lower-case characters are handled as well is not visible
// in this header - confirm against the implementation.
218 void replace_u_with_t( Sequence& seq );
219 
// Overload of replace_u_with_t() that operates on a SequenceSet.
// NOTE(review): presumably applied to every Sequence in the set - confirm.
226 void replace_u_with_t( SequenceSet& set );
227 
// Replace all occurrences of T by U in the sites of the Sequence.
// NOTE(review): whether lower-case characters are handled as well is not visible
// in this header - confirm against the implementation.
234 void replace_t_with_u( Sequence& seq );
235 
// Overload of replace_t_with_u() that operates on a SequenceSet.
// NOTE(review): presumably applied to every Sequence in the set - confirm.
242 void replace_t_with_u( SequenceSet& set );
243 
252 {
256  kDiscard,
257 
262 
263 };
264 
278  SequenceSet& set,
280  std::string const& counter_prefix = "_"
281 );
282 
283 // =================================================================================================
284 // Normalization
285 // =================================================================================================
286 
// Call normalize_nucleic_acid_code() for each site of the Sequence.
// accept_degenerated defaults to true.
// NOTE(review): the effect of accept_degenerated == false is not visible in this
// header - confirm against normalize_nucleic_acid_code().
292 void normalize_nucleic_acid_codes( Sequence& sequence, bool accept_degenerated = true );
293 
// Overload of normalize_nucleic_acid_codes() that operates on a SequenceSet.
// accept_degenerated defaults to true.
// NOTE(review): presumably every Sequence in the set is normalized - confirm
// against the implementation.
299 void normalize_nucleic_acid_codes( SequenceSet& sequence_set, bool accept_degenerated = true );
300 
// Call normalize_amino_acid_code() for each site of the Sequence.
// accept_degenerated defaults to true.
// NOTE(review): the effect of accept_degenerated == false is not visible in this
// header - confirm against normalize_amino_acid_code().
306 void normalize_amino_acid_codes( Sequence& sequence, bool accept_degenerated = true );
307 
// Overload of normalize_amino_acid_codes() that operates on a SequenceSet.
// accept_degenerated defaults to true.
// NOTE(review): presumably every Sequence in the set is normalized - confirm
// against the implementation.
313 void normalize_amino_acid_codes( SequenceSet& sequence_set, bool accept_degenerated = true );
314 
315 // =================================================================================================
316 // Filters
317 // =================================================================================================
318 
// Remove all Sequences from the SequenceSet whose length is below the given min_length.
325 void filter_min_sequence_length( SequenceSet& set, size_t min_length );
326 
// Remove all Sequences from the SequenceSet whose length is above the given max_length.
333 void filter_max_sequence_length( SequenceSet& set, size_t max_length );
334 
// Remove all Sequences from the SequenceSet whose length is not in between
// min_length and max_length.
// NOTE(review): whether the two bounds are inclusive is not visible in this
// header - confirm against the implementation.
342 void filter_min_max_sequence_length( SequenceSet& set, size_t min_length, size_t max_length );
343 
344 // =================================================================================================
345 // Print and Output
346 // =================================================================================================
347 
// Print a Sequence to an ostream in the form "label: sites".
// Returns a std::ostream reference, as is conventional for stream insertion.
354 std::ostream& operator << ( std::ostream& out, Sequence const& seq );
355 
// Overload of operator<< that prints a SequenceSet to an ostream.
// NOTE(review): the output layout for a set is not visible in this header;
// presumably each Sequence is printed like the single-Sequence overload - confirm.
363 std::ostream& operator << ( std::ostream& out, SequenceSet const& set );
364 
365 } // namespace sequence
366 } // namespace genesis
367 
368 #endif // include guard
genesis::sequence::normalize_amino_acid_codes
void normalize_amino_acid_codes(Sequence &sequence, bool accept_degenerated)
Call normalize_amino_acid_code() for each site of the Sequence.
Definition: sequence/functions/functions.cpp:386
genesis::sequence::remove_characters
void remove_characters(Sequence &seq, std::string const &search)
Remove all of the characters in search from the sites of the Sequence.
Definition: sequence/functions/functions.cpp:227
genesis::sequence::normalize_nucleic_acid_codes
void normalize_nucleic_acid_codes(Sequence &sequence, bool accept_degenerated)
Call normalize_nucleic_acid_code() for each site of the Sequence.
Definition: sequence/functions/functions.cpp:372
genesis::sequence::replace_t_with_u
void replace_t_with_u(Sequence &seq)
Replace all occurrences of T by U in the sites of the Sequence.
Definition: sequence/functions/functions.cpp:285
genesis::sequence::replace_u_with_t
void replace_u_with_t(Sequence &seq)
Replace all occurrences of U by T in the sites of the Sequence.
Definition: sequence/functions/functions.cpp:266
genesis::sequence::remove_sites
void remove_sites(Sequence &seq, utils::Bitvector sites)
Remove all sites from a Sequence where the given Bitvector is true, and keep all others.
Definition: sequence/functions/functions.cpp:183
genesis::sequence::MergeDuplicateSequencesCountPolicy::kAppendToLabel
@ kAppendToLabel
The counts are appended to the sequence label, separated by the counter_prefix.
genesis::sequence::operator<<
std::ostream & operator<<(std::ostream &out, Sequence const &seq)
Print a Sequence to an ostream in the form "label: sites".
Definition: sequence/functions/functions.cpp:449
genesis::sequence::filter_max_sequence_length
void filter_max_sequence_length(SequenceSet &set, size_t max_length)
Remove all Sequences from the SequenceSet whose length is above the given max_length.
Definition: sequence/functions/functions.cpp:416
genesis::sequence::MergeDuplicateSequencesCountPolicy
MergeDuplicateSequencesCountPolicy
Provide options for changing how merge_duplicate_sequences() handles the counts of merged Sequences.
Definition: sequence/functions/functions.hpp:251
genesis::sequence::filter_min_max_sequence_length
void filter_min_max_sequence_length(SequenceSet &set, size_t min_length, size_t max_length)
Remove all Sequences from the SequenceSet whose length is not in between the min_length and max_length...
Definition: sequence/functions/functions.cpp:428
genesis::sequence::gap_sites
utils::Bitvector gap_sites(Sequence const &seq, std::string const &gap_chars)
Return a Bitvector that is true where the Sequence has a gap and false where not.
Definition: sequence/functions/functions.cpp:82
genesis::sequence::validate_chars
bool validate_chars(SequenceSet const &set, std::string const &chars)
Returns true iff all Sequences only consist of the given chars.
Definition: sequence/functions/functions.cpp:121
char_lookup.hpp
genesis::sequence::filter_min_sequence_length
void filter_min_sequence_length(SequenceSet &set, size_t min_length)
Remove all Sequences from the SequenceSet whose length is below the given min_length.
Definition: sequence/functions/functions.cpp:404
genesis::sequence::nucleic_acid_codes_undetermined
std::string nucleic_acid_codes_undetermined()
Return all undetermined nucleic acid codes. Those are the characters `NOX.-?`.
Definition: codes.cpp:305
genesis::sequence::merge_duplicate_sequences
void merge_duplicate_sequences(SequenceSet &set, MergeDuplicateSequencesCountPolicy count_policy, std::string const &counter_prefix)
Merge all Sequences in a SequenceSet that have identical sites.
Definition: sequence/functions/functions.cpp:304
genesis::sequence::longest_sequence_length
size_t longest_sequence_length(SequenceSet const &set)
Return the length of the longest Sequence in the SequenceSet.
Definition: sequence/functions/functions.cpp:146
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::remove_gap_sites
void remove_gap_sites(SequenceSet &set, std::string const &gap_chars)
Remove all sites that only contain gap characters from the SequenceSet.
Definition: sequence/functions/functions.cpp:221
genesis::sequence::find_sites
utils::Bitvector find_sites(Sequence const &seq, std::string const &chars)
Find sites by character and mark them in a Bitvector.
Definition: sequence/functions/functions.cpp:60
genesis::sequence::MergeDuplicateSequencesCountPolicy::kDiscard
@ kDiscard
The counts are discarded.
genesis::sequence::replace_characters
void replace_characters(Sequence &seq, std::string const &search, char replacement)
Replace all occurrences of the chars in search by the replacement char, for all sites in the given Sequenc...
Definition: sequence/functions/functions.cpp:254
bitvector.hpp
codes.hpp
genesis::sequence::is_alignment
bool is_alignment(SequenceSet const &set)
Return true iff all Sequences in the SequenceSet have the same length.
Definition: sequence/functions/functions.cpp:164
genesis::sequence::total_length
size_t total_length(SequenceSet const &set)
Return the total length (sum) of all Sequences in the SequenceSet.
Definition: sequence/functions/functions.cpp:155
genesis::sequence::remove_all_gaps
void remove_all_gaps(Sequence &seq, std::string const &gap_chars)
Remove all gap characters from the sites of the Sequence.
Definition: sequence/functions/functions.cpp:244