|
A library for working with phylogenetic and population genetic data.
v0.32.0
|
|
Go to the documentation of this file.
50 #include <unordered_map>
51 #include <unordered_set>
62 std::string
const& chars
66 lookup.set_selection_upper_lower( chars,
true );
76 for(
size_t i = 0; i < seq.
length(); ++i ) {
77 result.set( i, chars[ seq[ i ] ] );
90 if( set.
size() == 0 ) {
99 lookup.set_selection_upper_lower( gap_chars,
true );
102 for(
auto const& seq : set ) {
103 if( seq.length() != result.size() ) {
104 throw std::runtime_error(
105 "Cannot calculate gap_sites() if SequenceSet is not an alignment."
111 for(
size_t i = 0; i < seq.length(); ++i ) {
112 if( ! lookup[ seq[ i ] ] ) {
113 result.set( i,
false );
125 lookup.set_selection_upper_lower( chars,
true );
127 for(
auto& s : set ) {
134 if( ! lookup[ c ] ) {
149 for(
auto const& seq : set ) {
150 max = std::max( max, seq.length() );
157 return std::accumulate( set.
begin(), set.
end(), 0,
159 return c + s.length();
166 if( set.
size() == 0 ) {
170 size_t length = set[0].length();
171 for(
auto& s : set ) {
172 if( s.length() !=
length ) {
186 throw std::runtime_error(
187 "Cannot remove sites from Sequence. "
188 "Given Bitvector has not the same size as the Sequence."
192 auto const num_sites = sites.
size() - sites.
count();
194 result.reserve( num_sites );
196 for(
size_t i = 0; i < sites.
size(); ++i ) {
207 for(
auto const& seq : set ) {
208 if( seq.length() != sites.
size() ) {
209 throw std::runtime_error(
210 "Cannot remove sites from SequenceSet. "
211 "Given Bitvector has not the same size as the Sequences."
216 for(
auto& seq : set ) {
223 auto sites =
gap_sites( set, gap_chars );
229 auto is_search_char = [&] (
char c ) {
230 return search.find( c ) != std::string::npos;
233 auto& str = seq.
sites();
234 str.erase( std::remove_if( str.begin(), str.end(), is_search_char ), str.end() );
239 for(
auto& sequence : set ) {
261 for(
auto& sequence : set ) {
268 for(
auto& c : seq.
sites() ) {
280 for(
auto& sequence : set ) {
287 for(
auto& c : seq.
sites() ) {
299 for(
auto& sequence : set ) {
307 std::string
const& counter_prefix
318 std::unordered_map< std::string, Duplicate > dup_map;
320 while( i < set.
size() ) {
323 if( dup_map.count( seq.sites() ) == 0 ) {
326 dup_map[ seq.sites() ].index = i;
327 dup_map[ seq.sites() ].count = 1;
335 ++dup_map[ seq.sites() ].count;
346 for(
size_t j = 0; j < set.
size(); ++j ) {
351 assert( dup_map.count(seq.sites()) > 0 );
352 assert( dup_map[ seq.sites() ].index == j );
355 auto count = dup_map[ seq.sites() ].count;
357 auto new_label = seq.label() + counter_prefix +
std::to_string(count);
358 seq.label( new_label );
374 for(
auto& c : sequence.
sites() ) {
381 for(
auto& sequence : sequence_set ) {
388 for(
auto& c : sequence.
sites() ) {
395 for(
auto& sequence : sequence_set ) {
407 while( index < set.
size() ) {
408 if( set.
at(index).
length() < min_length ) {
419 while( index < set.
size() ) {
420 if( set.
at(index).
length() > max_length ) {
431 while( index < set.
size() ) {
433 if( len < min_length || len > max_length ) {
452 printer.length_limit( 100 );
454 printer.print( out, seq );
461 printer.length_limit( 100 );
462 printer.sequence_limit( 10 );
464 printer.print( out, set );
void normalize_amino_acid_codes(Sequence &sequence, bool accept_degenerated)
Call normalize_amino_acid_code() for each site of the Sequence.
void remove_characters(Sequence &seq, std::string const &search)
Remove all of the characters in search from the sites of the Sequence.
void normalize_nucleic_acid_codes(Sequence &sequence, bool accept_degenerated)
Call normalize_nucleic_acid_code() for each site of the Sequence.
void replace_t_with_u(Sequence &seq)
Replace all occurrences of T by U in the sites of the Sequence.
void replace_u_with_t(Sequence &seq)
Replace all occurrences of U by T in the sites of the Sequence.
void remove_sites(Sequence &seq, utils::Bitvector sites)
Remove all sites from a Sequence where the given Bitvector is true, and keep all others.
@ kAppendToLabel
The counts are appended to the sequence label, separated by the counter_prefix.
std::ostream & operator<<(std::ostream &out, Sequence const &seq)
Print a Sequence to an ostream in the form "label: sites".
void filter_max_sequence_length(SequenceSet &set, size_t max_length)
Remove all Sequences from the SequenceSet whose length is above the given max_length.
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
MergeDuplicateSequencesCountPolicy
Provide options for changing how merge_duplicate_sequences() handles the counts of merged Sequences.
std::string replace_all_chars(std::string const &text, std::string const &search_chars, char replace)
Replace all occurrences of the search_chars in text by the replace char.
void filter_min_max_sequence_length(SequenceSet &set, size_t min_length, size_t max_length)
Remove all Sequences from the SequenceSet whose length is not in between the min_length and max_length...
Simple printer class for Sequences and SequenceSets.
utils::Bitvector gap_sites(Sequence const &seq, std::string const &gap_chars)
Return a Bitvector that is true where the Sequence has a gap and false where not.
std::string to_string(GenomeLocus const &locus)
bool validate_chars(SequenceSet const &set, std::string const &chars)
Returns true iff all Sequences only consist of the given chars.
Provides some commonly used string utility functions.
Provides easy and fast logging functionality.
void filter_min_sequence_length(SequenceSet &set, size_t min_length)
Remove all Sequences from the SequenceSet whose length is below the given min_length.
char normalize_nucleic_acid_code(char code, bool accept_degenerated)
Normalize a nucleic acid code.
size_t length() const
Return the length (number of sites) of this sequence.
void merge_duplicate_sequences(SequenceSet &set, MergeDuplicateSequencesCountPolicy count_policy, std::string const &counter_prefix)
Merge all Sequences in a SequenceSet that have identical sites.
size_t longest_sequence_length(SequenceSet const &set)
Return the length of the longest Sequence in the SequenceSet.
void remove(size_t index)
Remove the Sequence at a given index from the SequenceSet.
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Store a set of Sequences.
void remove_gap_sites(SequenceSet &set, std::string const &gap_chars)
Remove all sites that only contain gap characters from the SequenceSet.
char normalize_amino_acid_code(char code, bool accept_degenerated)
Normalize an amino acid code.
utils::Bitvector find_sites(Sequence const &seq, std::string const &chars)
Find sites by character and mark them in a Bitvector.
@ kDiscard
The counts are discarded.
void replace_characters(Sequence &seq, std::string const &search, char replacement)
Replace all occurrences of the chars in search by the replace char, for all sites in the given Sequence...
size_t count() const
Count the number of set bits in the Bitvector, that is, its Hamming weight, or population count (popc...
reference at(size_t index)
size_t size() const
Return the size (number of bits) of this Bitvector.
bool is_alignment(SequenceSet const &set)
Return true iff all Sequences in the SequenceSet have the same length.
size_t total_length(SequenceSet const &set)
Return the total length (sum) of all Sequences in the SequenceSet.
void remove_all_gaps(Sequence &seq, std::string const &gap_chars)
Remove all gap characters from the sites of the Sequence.