A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
genesis::sequence Namespace Reference

Classes

class  FastaInputIterator
 Iterate an input stream and parse it as Fasta sequences. More...
 
class  FastaOutputIterator
 
class  FastaReader
 Read Fasta sequence data. More...
 
class  FastaWriter
 Write Fasta data. More...
 
class  PhylipReader
 Read Phylip sequence data. More...
 
class  PhylipWriter
 
class  PrinterBitmap
 Print the sites of a SequenceSet as pixels in a bitmap. More...
 
class  PrinterSimple
 Simple printer class for Sequences and SequenceSets. More...
 
class  Sequence
 
class  SequenceSet
 Store a set of Sequences. More...
 
class  SignatureSpecifications
 Specifications for calculating signatures (like k-mer counts) from Sequences. More...
 
class  SiteCounts
 Store counts of the occurence for certain characters at the sites of Sequences. More...
 

Functions

double absolute_entropy (SiteCounts const &counts, SiteEntropyOptions per_site_options=SiteEntropyOptions::kDefault)
 Return the sum of all site entropies. More...
 
std::string amino_acid_codes_all ()
 Return all valid amino acid codes. Those are "ACDEFGHIKLMNOPQRSTUVWYBJZX*-?". More...
 
std::string amino_acid_codes_degenerated ()
 Return all degenerated amino acid codes. Those are "BJZ". More...
 
std::string amino_acid_codes_plain ()
 Return all plain amino acid codes. Those are "ACDEFGHIKLMNOPQRSTUVWY". More...
 
std::string amino_acid_codes_undetermined ()
 Return all undetermined amino acid codes. Those are "X*-?". More...
 
std::map< char, utils::Coloramino_acid_colors ()
 Return a map of Colors for each amino acid code. More...
 
std::string amino_acid_name (char code)
 Get the name of a amino acid given its IUPAC code. More...
 
std::map< char, std::string > amino_acid_text_colors ()
 Return a map of text colors for each amino acid code. More...
 
double averaged_entropy (SiteCounts const &counts, bool only_determined_sites=false, SiteEntropyOptions per_site_options=SiteEntropyOptions::kDefault)
 Return the averaged sum of all site entropies. More...
 
std::map< char, double > base_frequencies (Sequence const &seq, std::string const &plain_chars)
 Get the base frequencies of the sites in a Sequence given the base chars. More...
 
std::map< char, double > base_frequencies (SequenceSet const &set, std::string const &plain_chars)
 Get the base frequencies of the sites in a SequenceSet given the base chars. More...
 
std::map< char, double > base_frequencies_accumulator (std::map< char, size_t > const &sitehistogram, std::string const &plain_chars)
 Local helper function that turns a site histogram into base frequencies. More...
 
std::string consensus_sequence_cavener (SiteCounts const &counts, bool allow_gaps=true)
 Calculate a consensus sequence using the method by Cavener for nucleic acid codes ACGT and their ambiguities. More...
 
std::string consensus_sequence_cavener (SequenceSet const &sequences, bool allow_gaps=true)
 Calculate a consensus sequence using the method by Cavener for nucleic acid codes ACGT and their ambiguities. More...
 
template<typename CharSelector >
std::string consensus_sequence_template (SiteCounts const &counts, bool const allow_gaps, CharSelector const &char_selector)
 Local helper function template that handles the common code for the nucleic acid consensus sequence functions. More...
 
std::string consensus_sequence_with_ambiguities (SiteCounts const &counts, double similarity_factor=0.9, bool allow_gaps=true)
 Calculate a consensus sequence by using the most frequent characters at each site, for nucleic acid codes ACGT and their ambiguities. More...
 
std::string consensus_sequence_with_ambiguities (SequenceSet const &sequences, double similarity_factor=0.9, bool allow_gaps=true)
 Calculate a consensus sequence by using the most frequent characters at each site, for nucleic acid codes ACGT and their ambiguities. More...
 
std::string consensus_sequence_with_majorities (SiteCounts const &counts, bool allow_gaps=true, char gap_char= '-')
 Calculate the majority rule consensus sequence by using the most frequent character at each site. More...
 
std::string consensus_sequence_with_majorities (SequenceSet const &sequences, std::string const &characters, bool allow_gaps=true, char gap_char= '-')
 Calculate the majority rule consensus sequence by using the most frequent character at each site. More...
 
std::string consensus_sequence_with_majorities (SequenceSet const &sequences, bool allow_gaps=true)
 Calculate the majority rule consensus sequence by using the most frequent character at each site for nucleic acid codes ACGT. More...
 
std::string consensus_sequence_with_threshold (SiteCounts const &counts, double frequency_threshold=0.6, bool allow_gaps=true, bool use_ambiguities=true)
 Calculate a consensus sequence where the character frequency needs to be above a given threshold, for nucleic acid codes ACGT. More...
 
std::string consensus_sequence_with_threshold (SequenceSet const &sequences, double frequency_threshold=0.6, bool allow_gaps=true, bool use_ambiguities=true)
 Calculate a consensus sequence where the character frequency needs to be above a given threshold, for nucleic acid codes ACGT. More...
 
size_t count_chars (SequenceSet const &set, std::string const &chars)
 Count the number of occurrences of the given chars within the sites of the SequenceSet. More...
 
bool CountPairComparator (CountPair const &lhs, CountPair const &rhs)
 Local helper function to sort a CountPair. More...
 
void filter_by_label_list (SequenceSet &set, std::unordered_set< std::string > const &labels, bool invert=false)
 Remove all those Sequences from a SequenceSet whose labels are in the given list. More...
 
void filter_max_sequence_length (SequenceSet &set, size_t max_length)
 Remove all Sequences from the SequenceSet whose length is above the given max_length. More...
 
void filter_min_max_sequence_length (SequenceSet &set, size_t min_length, size_t max_length)
 Remove all Sequences from the SequenceSet whose length is not inbetween the min_length and max_length. More...
 
void filter_min_sequence_length (SequenceSet &set, size_t min_length)
 Remove all Sequences from the SequenceSet whose length is below the given min_length. More...
 
Sequence const * find_sequence (SequenceSet const &set, std::string const &label)
 Return a pointer to a Sequence with a specific label, or nullptr iff not found. More...
 
utils::Bitvector find_sites (Sequence const &seq, std::string const &chars)
 Find sites by character and mark them in a Bitvector. More...
 
utils::Bitvector find_sites (Sequence const &seq, utils::CharLookup< bool > const &chars)
 Find sites by character and mark them in a Bitvector. More...
 
size_t gap_site_count (SiteCounts const &counts)
 
utils::Bitvector gap_sites (Sequence const &seq, std::string const &gap_chars=nucleic_acid_codes_undetermined())
 Return a Bitvector that is true where the Sequence has a gap and false where not. More...
 
utils::Bitvector gap_sites (SequenceSet const &set, std::string const &gap_chars=nucleic_acid_codes_undetermined())
 Return a Bitvector that is true where all Sequences in the SequenceSet have a gap and false where not, that is, where at least on Sequence is not a gap. More...
 
double gapyness (SequenceSet const &set, std::string const &gap_chars)
 Return the "gapyness" of the Sequences, i.e., the proportion of gap chars and other completely undetermined chars to the total length of all sequences. More...
 
size_t guess_sequence_abundance (Sequence const &sequence)
 Guess the abundance of a Sequence, using it's label. More...
 
size_t guess_sequence_abundance (std::string const &label)
 Guess the abundance of a Sequence, given it's label. More...
 
bool has_unique_labels (SequenceSet const &set, bool case_sensitive=true)
 Return true iff all labels of the Sequences in the SequenceSet are unique. More...
 
bool has_valid_label (Sequence const &seq)
 Check whether a Sequence has a valid label. More...
 
bool has_valid_labels (SequenceSet const &set)
 Check whether all Sequences in a SequenceSet have valid labels. More...
 
bool is_alignment (SequenceSet const &set)
 Return true iff all Sequences in the SequenceSet have the same length. More...
 
bool is_valid_label (std::string const &label)
 Check whether a given string is a valid label for a Sequence. More...
 
std::string kmer_string_overlapping (Sequence const &sequence, SignatureSpecifications const &settings)
 Return the sequence spitted into overlapping k-mers. More...
 
void kmer_string_overlapping (Sequence const &sequence, SignatureSpecifications const &settings, std::ostream &out)
 Print the sequence spitted into overlapping k-mers. More...
 
void kmer_string_overlapping_line (Sequence const &sequence, SignatureSpecifications const &settings, std::ostream &out)
 Local helper function that writes an overlapping kmer string to a stream. More...
 
void kmer_string_single_kmer (Sequence const &sequence, SignatureSpecifications const &settings, size_t start, std::ostream &out)
 Local helper function that writes one kmer string to a stream. More...
 
std::vector< std::string > kmer_strings_non_overlapping (Sequence const &sequence, SignatureSpecifications const &settings)
 Return the sequence spitted into a set of non-overlapping k-mers. More...
 
void kmer_strings_non_overlapping (Sequence const &sequence, SignatureSpecifications const &settings, std::ostream &out)
 Print the sequence spitted into non-overlapping k-mers. More...
 
void kmer_strings_non_overlapping_line (Sequence const &sequence, SignatureSpecifications const &settings, std::ostream &out, size_t offset)
 Local helper function that does one line of a non overlapping kmer string. More...
 
std::unordered_set< std::string > labels (SequenceSet const &set)
 Return a set of all labels of the SequenceSet. More...
 
size_t longest_sequence_length (SequenceSet const &set)
 Return the length of the longest Sequence in the SequenceSet. More...
 
void merge_duplicate_sequences (SequenceSet &set, MergeDuplicateSequencesCountPolicy count_policy=MergeDuplicateSequencesCountPolicy::kDiscard, std::string const &counter_prefix="_")
 Merge all Sequences in a SequenceSet that have identical sites. More...
 
char normalize_amino_acid_code (char code, bool accept_degenerated=true)
 Normalize an amino acid code. More...
 
void normalize_amino_acid_codes (Sequence &sequence, bool accept_degenerated=true)
 Call normalize_amino_acid_code() for each site of the Sequence. More...
 
void normalize_amino_acid_codes (SequenceSet &sequence_set, bool accept_degenerated=true)
 Call normalize_amino_acid_code() for each site of all Sequences in the SequenceSet. More...
 
std::string normalize_code_alphabet (std::string const &alphabet)
 Normalize an alphabet set of Sequence codes, i.e., make them upper case, sort them, and remove duplicates. More...
 
char normalize_nucleic_acid_code (char code, bool accept_degenerated=true)
 Normalize a nucleic acide code. More...
 
void normalize_nucleic_acid_codes (Sequence &sequence, bool accept_degenerated=true)
 Call normalize_nucleic_acid_code() for each site of the Sequence. More...
 
void normalize_nucleic_acid_codes (SequenceSet &sequence_set, bool accept_degenerated=true)
 Call normalize_nucleic_acid_code() for each site of all Sequences in the SequenceSet. More...
 
std::string nucleic_acid_ambiguities (char code)
 Return the possible ambiguous nucleic acid codes for a given code char. More...
 
char nucleic_acid_ambiguity_code (std::string codes)
 Return the nucleic acid code that represents all given codes. More...
 
std::string nucleic_acid_codes_all ()
 Return all valid nucleic acid codes. Those are "ACGTUWSMKRYBDHVNOX.-?". More...
 
std::string nucleic_acid_codes_degenerated ()
 Return all degenerated nucleic acid codes. Those are "WSMKRYBDHV". More...
 
std::string nucleic_acid_codes_plain ()
 Return all plain nucleic acid codes. Those are "ACGTU". More...
 
std::string nucleic_acid_codes_undetermined ()
 Return all undetermined nucleic acid codes. Those are "NOX.-?". More...
 
std::map< char, utils::Colornucleic_acid_colors ()
 Return a map of Colors for each nucleic acid code. More...
 
std::string nucleic_acid_name (char code)
 Get the name of a nucleic acid given its IUPAC code. More...
 
std::map< char, std::string > nucleic_acid_text_colors ()
 Return a map of text colors for each nucleic acid code. More...
 
bool operator& (SiteEntropyOptions lhs, SiteEntropyOptions rhs)
 And-operator to check whether a SiteEntropyOptions is set. More...
 
std::ostream & operator<< (std::ostream &out, Sequence const &seq)
 Print a Sequence to an ostream in the form "label: sites". More...
 
std::ostream & operator<< (std::ostream &out, SequenceSet const &set)
 Print a SequenceSet to an ostream in the form "label: sites". More...
 
SiteEntropyOptions operator| (SiteEntropyOptions lhs, SiteEntropyOptions rhs)
 Or-operator to combine two SiteEntropyOptionss. More...
 
SiteEntropyOptionsoperator|= (SiteEntropyOptions &lhs, SiteEntropyOptions rhs)
 Or-assignment-operator to combine two SiteEntropyOptionss. More...
 
void relabel_md5 (Sequence &seq)
 Relabel the Sequence using the MD5 hash digest of its sites. More...
 
void relabel_md5 (SequenceSet &set)
 Relabel all Sequences in the SequenceSet using the MD5 hash digest of the sites. More...
 
void relabel_sha1 (Sequence &seq)
 Relabel the Sequence using the SHA1 hash digest of its sites. More...
 
void relabel_sha1 (SequenceSet &set)
 Relabel all Sequences in the SequenceSet using the SHA1 hash digest of the sites. More...
 
void relabel_sha256 (Sequence &seq)
 Relabel the Sequence using the SHA256 hash digest of its sites. More...
 
void relabel_sha256 (SequenceSet &set)
 Relabel all Sequences in the SequenceSet using the SHA256 hash digest of the sites. More...
 
void remove_all_gaps (Sequence &seq, std::string const &gap_chars=nucleic_acid_codes_undetermined())
 Remove all gap characters from the sites of the Sequence. More...
 
void remove_all_gaps (SequenceSet &set, std::string const &gap_chars=nucleic_acid_codes_undetermined())
 Remove all gap characters from the sites of the Sequences in the SequenceSet. More...
 
void remove_characters (Sequence &seq, std::string const &search)
 Remove all of the characters in search from the sites of the Sequence. More...
 
void remove_characters (SequenceSet &set, std::string const &search)
 Remove all of the characters in search from the sites of the Sequences in the SequenceSet. More...
 
void remove_gap_sites (SequenceSet &set, std::string const &gap_chars=nucleic_acid_codes_undetermined())
 Remove all sites that only contain gap characters from the SequenceSet. More...
 
void remove_sites (Sequence &seq, utils::Bitvector sites)
 Remove all sites from a Sequence where the given Bitvector is true, and keep all others. More...
 
void remove_sites (SequenceSet &set, utils::Bitvector sites)
 Remove all sites from all Sequences in a SequenceSet where the given Bitvector is true, and keep all others. More...
 
void replace_characters (Sequence &seq, std::string const &search, char replacement)
 Replace all occurences of the chars in search by the replace char, for all sites in the given Sequence. More...
 
void replace_characters (SequenceSet &set, std::string const &search, char replacement)
 Replace all occurences of the chars in search by the replace char, for all sites in the Sequences in the given SequenceSet. More...
 
void replace_t_with_u (Sequence &seq)
 Replace all occurrences of T by U in the sites of the Sequence. More...
 
void replace_t_with_u (SequenceSet &set)
 Replace all occurrences of T by U in the sites of all Sequences in the SequenceSet. More...
 
void replace_u_with_t (Sequence &seq)
 Replace all occurrences of U by T in the sites of the Sequence. More...
 
void replace_u_with_t (SequenceSet &set)
 Replace all occurrences of U by T in the sites of all Sequences in the SequenceSet. More...
 
std::string reverse_complement (std::string const &sequence, bool accept_degenerated=true)
 Get the reverse complement of a nucleic acid sequence. More...
 
std::string sanitize_label (std::string const &label)
 Sanitize a label by replacing all invalid characters with underscores. More...
 
void sanitize_label (Sequence &seq)
 Sanitize a label by replacing all invalid characters with underscores. More...
 
void sanitize_labels (SequenceSet &set)
 Sanitize the labels of all Sequences in the SequenceSet by replacing all invalid characters with underscores. More...
 
template<class Combinator >
std::vector< double > signature_complementarity_frequencies_helper (Sequence const &sequence, SignatureSpecifications const &settings, Combinator combinator)
 Local helper function that returns a vector where the frequencies of the non-palindromic kmers are combined using a functor. More...
 
std::vector< size_t > signature_counts (Sequence const &sequence, SignatureSpecifications const &settings)
 Count the occurences of k-mers in the sequence according to the settings. More...
 
std::vector< double > signature_frequencies (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the frequencies of occurences of k-mers in the sequence according to the settings. More...
 
std::vector< double > signature_frequency_ratios_1 (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the ratio 1 signature of a sequence. More...
 
std::vector< double > signature_frequency_ratios_2 (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the ratio 2 signature of a sequence. More...
 
std::vector< double > signature_jensen_shannon (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the Jensen-Shannon (JS) signature of a sequence. More...
 
std::vector< double > signature_maximal_complementarity_frequencies (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the signature of a sequence that uses the maximum frequency of reverse complement k-mers. More...
 
std::vector< double > signature_minimal_complementarity_frequencies (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the signature of a sequence that uses the minimum frequency of reverse complement k-mers. More...
 
std::vector< size_t > signature_ranks (Sequence const &sequence, SignatureSpecifications const &settings)
 Calcuate the rank signature of a sequence according to the settings. More...
 
std::vector< double > signature_reverse_identity_frequencies (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the signature of a sequence that uses only the frequencies of k-mers whose reverse complement is the k-mer itself. More...
 
std::vector< size_t > signature_symmetrized_counts (Sequence const &sequence, SignatureSpecifications const &settings)
 Calcuate the symmetrized counts of the sequence according to the settings. More...
 
std::vector< double > signature_symmetrized_frequencies (Sequence const &sequence, SignatureSpecifications const &settings)
 Calcuate the symmetrized counts of the sequence according to the settings. More...
 
template<class T >
std::vector< T > signature_symmetrized_helper (std::vector< T > const &kmers, SignatureSpecifications const &settings)
 Local helper function that adds up the values for reverse complement k-mers. More...
 
std::vector< size_t > signature_symmetrized_ranks (Sequence const &sequence, SignatureSpecifications const &settings)
 Calcuate the symmetrized rank signature of a sequence according to the settings. More...
 
double site_entropy (SiteCounts const &counts, size_t site_index, SiteEntropyOptions options=SiteEntropyOptions::kDefault)
 Calculate the entropy at one site of a SiteCounts object. More...
 
std::map< char, size_t > site_histogram (Sequence const &seq)
 Get a histogram of the occurrences of particular sites, given a Sequence. More...
 
std::map< char, size_t > site_histogram (SequenceSet const &set)
 Get a histogram of the occurrences of particular sites, given a SequenceSet. More...
 
double site_information (SiteCounts const &counts, size_t site_index, bool use_small_sample_correction=false, SiteEntropyOptions options=SiteEntropyOptions::kDefault)
 Calculate the information content at one site of a SiteCounts object. More...
 
void swap (SequenceSet &lhs, SequenceSet &rhs)
 
size_t total_length (SequenceSet const &set)
 Return the total length (sum) of all Sequences in the SequenceSet. More...
 
bool validate_chars (SequenceSet const &set, std::string const &chars)
 Returns true iff all Sequences only consist of the given chars. More...
 

Enumerations

enum  MergeDuplicateSequencesCountPolicy { kDiscard, kAppendToLabel }
 Provide options for changing how merge_duplicate_sequences() handles the counts of merged Sequences. More...
 
enum  SiteEntropyOptions : unsigned char { kDefault = 0, kIncludeGaps = 1, kWeighted = 2, kNormalized = 4 }
 Option flags to refine the calculation of site_entropy(). More...
 

Typedefs

using CountPair = std::pair< size_t, char >
 Helper struct to store characters sorted by their count. More...
 

Variables

static const
std::unordered_map< char,
std::string > 
amino_acid_code_to_name
 
static const std::map< char,
utils::Color
amino_acid_colors_map
 
static const std::map< char,
std::string > 
amino_acid_text_colors_map
 
static const
std::unordered_map< char,
std::string > 
nucleic_acid_ambiguity_char_map
 
static const
std::unordered_map
< std::string, char > 
nucleic_acid_ambiguity_string_map
 
static const
std::unordered_map< char,
std::string > 
nucleic_acid_code_to_name
 
static const std::map< char,
utils::Color
nucleic_acid_colors_map
 
static const std::map< char,
std::string > 
nucleic_acid_text_colors_map
 

Function Documentation

double absolute_entropy ( SiteCounts const &  counts,
SiteEntropyOptions  per_site_options = SiteEntropyOptions::kDefault 
)

Return the sum of all site entropies.

This function simply sums up up the site_entropy() for all sites of the SiteCounts object. The function additionally takes optional flags to refine the site entropy calculation, see SiteEntropyOptions for their explanation.

Definition at line 139 of file sequence/functions/entropy.cpp.

std::string amino_acid_codes_all ( )

Return all valid amino acid codes. Those are "ACDEFGHIKLMNOPQRSTUVWYBJZX*-?".

Definition at line 335 of file codes.cpp.

std::string amino_acid_codes_degenerated ( )

Return all degenerated amino acid codes. Those are "BJZ".

Definition at line 325 of file codes.cpp.

std::string amino_acid_codes_plain ( )

Return all plain amino acid codes. Those are "ACDEFGHIKLMNOPQRSTUVWY".

Definition at line 320 of file codes.cpp.

std::string amino_acid_codes_undetermined ( )

Return all undetermined amino acid codes. Those are "X*-?".

Definition at line 330 of file codes.cpp.

std::map< char, utils::Color > amino_acid_colors ( )

Return a map of Colors for each amino acid code.

This function gives a Color for each amino acid code.

Definition at line 577 of file codes.cpp.

std::string amino_acid_name ( char  code)

Get the name of a amino acid given its IUPAC code.

The codes are translated as follows:

A Alanine
B Aspartic acid or Asparagine
C Cysteine
D Aspartic acid
E Glutamic acid
F Phenylalanine
G Glycine
H Histidine
I Isoleucine
J Leucine or Isoleucine
K Lysine
L Leucine
M Methionine
N Asparagine
O Pyrrolysine
P Proline
Q Glutamine
R Arginine
S Serine
T Threonine
U Selenocysteine
V Valine
W Tryptophan
Y Tyrosine
Z Glutamic acid or Glutamine
X any
* translation stop
- gap
? gap

The code char is treated case-insensitive. If the given code char is not valid, an std::out_of_range exception is thrown.

Definition at line 595 of file codes.cpp.

std::map< char, std::string > amino_acid_text_colors ( )

Return a map of text colors for each amino acid code.

This function gives a color name usable for utils::Style for each amino acid code. The return value of this function can for example be used in sequence::print_color() function.

Definition at line 567 of file codes.cpp.

double averaged_entropy ( SiteCounts const &  counts,
bool  only_determined_sites = false,
SiteEntropyOptions  per_site_options = SiteEntropyOptions::kDefault 
)

Return the averaged sum of all site entropies.

This function sums up up the site_entropy() for all sites of the SiteCounts object and returns the average result per site.

If only_determined_sites is false (default), the average is calculated using the total number of sites, that is, it simply calculates the average entropy per site.

If only_determined_sites is true, the average is calculated using the number of determined sites only; that is, sites that only contain zeroes in all counts are skipped. Those sites do not contribute entropy anyway. Thus, it calcuates the average entropy per determiend site.

The function additionally takes optional flags to refine the site entropy calculation, see SiteEntropyOptions for their explanation.

Definition at line 154 of file sequence/functions/entropy.cpp.

std::map< char, double > base_frequencies ( Sequence const &  seq,
std::string const &  plain_chars 
)

Get the base frequencies of the sites in a Sequence given the base chars.

This returns the relative proportions of the given plain_chars to each other. Typically, the given chars come from either nucleic_acid_codes_plain() or amino_acid_codes_plain(), depending on the dataset.

It is necessary to select those chars on a per-dataset basis, as it is up to the user to define the meaning of those chars.

Definition at line 102 of file sequence/functions/stats.cpp.

std::map< char, double > base_frequencies ( SequenceSet const &  set,
std::string const &  plain_chars 
)

Get the base frequencies of the sites in a SequenceSet given the base chars.

See the Sequence implementation of this function for details.

Definition at line 108 of file sequence/functions/stats.cpp.

std::map<char, double> genesis::sequence::base_frequencies_accumulator ( std::map< char, size_t > const &  sitehistogram,
std::string const &  plain_chars 
)

Local helper function that turns a site histogram into base frequencies.

Definition at line 78 of file sequence/functions/stats.cpp.

std::string consensus_sequence_cavener ( SiteCounts const &  counts,
bool  allow_gaps = true 
)

Calculate a consensus sequence using the method by Cavener for nucleic acid codes ACGT and their ambiguities.

For every site, the nucleotides are sorted by frequency. Then, the following checks are performed:

  1. A single nucleotide is used if its frequency is at least 50% and more than twice as high as the second most frequent nucleotide.
  2. If two nucleotides occur in at leat 75% of the sequences, and rule 1 does not apply, their ambiguity code is used (double-degenerate code).
  3. If none of the above applies, but one of the nucleotides has a count of zero, the (triple-degenerate) code of the other three nucleotides is used.
  4. In all other cases, the code 'N' is used.

The method is meant for finding a consensus sequence in sets of sequences without gaps. This implementation however also allows gaps, which are just treated as a normal character if allow_gaps is true (default). In this case, if any of rules 1-3 applies to a gap character, the result is simply reduced to a gap. That is, whenever a gap is used to form a degenerate code according to the rules, the whole code is reduced to a gap.

If however allow_gaps is false, gap characters are completely ignored from the counting.

The method was described in:

D. R. Cavener, "Comparison of the consensus sequence flanking translational start sites in Drosophila and vertebrates", Nucleic Acids Res., vol. 15, no. 4, 1987.

The method is also used by the Transfac Project:

V. Matys et al., "TRANSFAC: Transcriptional regulation, from patterns to profiles", Nucleic Acids Res., vol. 31, no. 1, pp. 374–378, 2003.

Definition at line 466 of file consensus.cpp.

std::string consensus_sequence_cavener ( SequenceSet const &  sequences,
bool  allow_gaps = true 
)

Calculate a consensus sequence using the method by Cavener for nucleic acid codes ACGT and their ambiguities.

See consensus_sequence_cavener(...) for details. This is merely a wrapper function that takes a SequenceSet instead of a SiteCounts object.

Definition at line 528 of file consensus.cpp.

std::string genesis::sequence::consensus_sequence_template ( SiteCounts const &  counts,
bool const  allow_gaps,
CharSelector const &  char_selector 
)

Local helper function template that handles the common code for the nucleic acid consensus sequence functions.

Definition at line 73 of file consensus.cpp.

std::string consensus_sequence_with_ambiguities ( SiteCounts const &  counts,
double  similarity_factor = 0.9,
bool  allow_gaps = true 
)

Calculate a consensus sequence by using the most frequent characters at each site, for nucleic acid codes ACGT and their ambiguities.

The function calculates a consensus sequence for nucleic acid codes (ACGT), using their ambiguity codes (e.g., W for "weak" == AT) if the counts (i.e., character frequencies) are similar at a site. It uses similarity_factor to decide which counts are close enough to each other in order to be considered ambiguous.

For example, with similarity_factor == 1.0, only exact matches are used, that is, if two counts are exactly the same. Let count('A') == 42 and count('T') == 42, and both other counts (C and G) be 0, this results in the code W at that site. If however count('T') == 41, only A is used for the site. Thus, with similarity_factor == 1.0, this function behaves very similar to consensus_sequence_with_majorities(), except in cases were two counts are exaclty the same.

On the other hand, with similarity_factor == 0.0, all codes that are present at a site are considered to be ambiguous. That is, if a site contains counts > 0 for A, G and T, the resulting site gets the code D ("not C").

For intermediate values, e.g., the default 0.9, the value is used as a threshold to decide the ambiguities. For example, let count('A') == 42 and count('T') == 38, and both other counts be 0. Then, the allowed deviation from the maximum 42 is 0.9 * 42 = 37.8. Thus, as the count for T is above this value, those two codes are considered ambiguous, resulting in a W at that site.

The optional parameter allow_gaps (default is true) behaves similar to its counterpart in consensus_sequence_with_majorities(). If set to true, the count of the gap character is also considered. As the ambiguity for a gap combined with any other character is still a gap, sites where gap is the most frequent charater or is within the deviation get a gap as result.

If allow_gaps is set to false instead, gaps are not considered. That means, the ambiguities are calculated as if there were no gaps. So even if a site contains mostly gaps, but only a few other characters, those will be used. Solely all-gap sites result in a gap at that site.

Remark: As this function expects nucleic acid codes, the gap character is fixed to '-' here. The ambiguity codes are converted using nucleic_acid_ambiguity_code(). See there for more information on the used codes.

If the provided SiteCounts object does not use nucleic acid codes, or if the similarity_factor is not within the range [ 0.0, 1.0 ], an exception is thrown.

Definition at line 253 of file consensus.cpp.

std::string consensus_sequence_with_ambiguities ( SequenceSet const &  sequences,
double  similarity_factor = 0.9,
bool  allow_gaps = true 
)

Calculate a consensus sequence by using the most frequent characters at each site, for nucleic acid codes ACGT and their ambiguities.

See consensus_sequence_with_ambiguities(...) for details. This is merely a wrapper function that takes a SequenceSet instead of a SiteCounts object.

Definition at line 324 of file consensus.cpp.

std::string consensus_sequence_with_majorities ( SiteCounts const &  counts,
bool  allow_gaps = true,
char  gap_char = '-' 
)

Calculate the majority rule consensus sequence by using the most frequent character at each site.

The function creates a consensus sequence by using the character at each position that has the highest count (or frequency). It does not assume any character codes. Thus, it works for all kinds of sequence codes, e.g., nucleic acid or amino acid codes.

The optional parameter allow_gaps (default is true) determines whether gaps in the consensus sequence are allowed. By default, if a site consists mostly of gaps, the consensus sequence also contains a gap at that site. If however this option is set to false, the consensus sequence will contain the most frequent non-gap character, even if there are more gaps at this site than the character itself. In other words, if the parameter is set to false, gaps are treated as missing characters instead of another type of character for computing the consensus. The only exception are gaps-only sites; in this case, the resulting sites contain a gap characters even if the parameter is set to false.

The optional parameter gap_char (default value '-') is used for sites where no counts are available (i.e., are all zero), or, if allow_gaps is set to true, for sites that contain mostly gaps.

Furthermore, if two or more characters have the same frequency, the first one is used. That is, the one that appears first in SiteCounts::characters().

For an alternative version of this function that takes those ambiguities into account, see consensus_sequence_with_ambiguities(). Also, for a version of this function that takes a threshold for the character frequencies into account, see consensus_sequence_with_threshold(). However, both of them currently only work for nucleic acid codes (ACGT).

Definition at line 162 of file consensus.cpp.

std::string consensus_sequence_with_majorities ( SequenceSet const &  sequences,
std::string const &  characters,
bool  allow_gaps = true,
char  gap_char = '-' 
)

Calculate the majority rule consensus sequence by using the most frequent character at each site.

See consensus_sequence_with_majorities(...) for details.

This function is merely a wrapper that instead of a SiteCounts objects, takes a SequenceSet object and the set of characters to be used for counting character frequencies in the Sequences. That means, only the provided characters are counted and used for the consensus sequence. This is useful in order to get rid of errors and noise in the Sequences. For example, if you want to build a consensus sequence for a set of sequences with amino acid codes, use amino_acid_codes_plain() for set characters parameter.

Definition at line 212 of file consensus.cpp.

std::string consensus_sequence_with_majorities ( SequenceSet const &  sequences,
bool  allow_gaps = true 
)

Calculate the majority rule consensus sequence by using the most frequent character at each site for nucleic acid codes ACGT.

See consensus_sequence_with_majorities(...) for details.

This function is merely a wrapper that instead of a SiteCounts objects, takes a SequenceSet object consisting of Sequences with nucleic acid codes ACGT and uses '-' for gaps.

Definition at line 237 of file consensus.cpp.

std::string consensus_sequence_with_threshold ( SiteCounts const &  counts,
double  frequency_threshold = 0.6,
bool  allow_gaps = true,
bool  use_ambiguities = true 
)

Calculate a consensus sequence where the character frequency needs to be above a given threshold, for nucleic acid codes ACGT.

The function calculates a consensus sequence for nucleic acid codes (ACGT). It uses the frequency of characters at each site to determine the consensus. The frequency is relative to the total number of counts at that site, thus, it is a value in the range [ 0.0, 1.0 ].

If the frequency of a character at a site is above the given frequency_threshold, it is used for the consensus. If not, the resulting character depends on use_ambiguities. If use_ambiguities is set to true (default), the sorted frequencies of the characters are added until the threshold is reached, and the ambiguity code for those characters is used. For example, let frequency_threshold == 0.9, count('A') == 42 and count('T') == 42, and both other counts (C and G) be 0. Then, neither A nor T have counts above the threshold, but combined they do, so the result is code W == AT at that site. If however use_ambiguities is false, the mask character X is used for sites that are below the threshold.

Furthermore, if allow_gaps is set to true (default), gaps are counted when determining the threshold and checking whether the frequency is above it. That is, gaps are then treated as just another character at the site. For sites where the gap frequency is needed to reach the threshold, a gap is used as consensus. If allow_gaps is false, however, gaps are not counted for determining the frequency of the other characters. This is similar to the counterpart in consensus_sequence_with_majorities(). Solely sites that are gaps-only result in a gap char for the consensus then.

For frequency_threshold < 0.5, it may happen that more than one character has a frequency above the threshold. In such cases, the most frequent character is used (or, if they have exaclty the same counts, they are used in the order ACGT). This is in line with the behaviour of consensus_sequence_with_majorities(). Usually, however, the threshold is above 0.5 anyway, as this gives more meaningful results. If you want to use ambiguity characters for low frequency characters, you can use consensus_sequence_with_ambiguities() instead.

An extreme case is a frequency_threshold of 1.0. In this case, for sites which only have one character, this one is directly used in the consensus. Sites with multiple different characters result in the ambiguity code of all occuring characters at that site. Thus, the function then behaves similar to consensus_sequence_with_ambiguities() with a similarity_factor of 0.0.

The other extrem case is a frequency_threshold of 0.0. In this case, the function simply uses the most frequent character per site, as it always fulfills the threshold. As said above, if then more than one character has exactly the same frequency, they are used in the order ACGT, thus the function then behaves similar to consensus_sequence_with_majorities().

Remark: As this function expects nucleic acid codes, the gap character is fixed to '-' here, and the mask character to 'X'. The ambiguity codes are converted using nucleic_acid_ambiguity_code(). See there for more information on the used codes.

If the provided SiteCounts object does not use nucleic acid codes, or if the frequency_threshold is not within the range [ 0.0, 1.0 ], an exception is thrown.

Definition at line 352 of file consensus.cpp.

std::string consensus_sequence_with_threshold ( SequenceSet const &  sequences,
double  frequency_threshold = 0.6,
bool  allow_gaps = true,
bool  use_ambiguities = true 
)

Calculate a consensus sequence where the character frequency needs to be above a given threshold, for nucleic acid codes ACGT.

See consensus_sequence_with_ambiguities(...) for details. This is merely a wrapper function that takes a SequenceSet instead of a SiteCounts object.

Definition at line 432 of file consensus.cpp.

size_t count_chars ( SequenceSet const &  set,
std::string const &  chars 
)

Count the number of occurrences of the given chars within the sites of the SequenceSet.

This function can be used to count e.g. gaps or ambiguous characters in sequences. For presettings of usable chars, see the functions nucleic_acid_codes_... and amino_acid_codes_.... The chars are treated case-insensitive.

If chars contains invalid (non-standard ASCII) characters, an std::invalid_argument exception is thrown.

Definition at line 118 of file sequence/functions/stats.cpp.

bool genesis::sequence::CountPairComparator ( CountPair const &  lhs,
CountPair const &  rhs 
)

Local helper function to sort a CountPair.

The comparatorfirst sorts by count, and for equal counts, sorts alphanumercially by the char. Thus, gaps are always first (their ascii code is smaller than all letters).

Definition at line 63 of file consensus.cpp.

void filter_by_label_list ( SequenceSet &  set,
std::unordered_set< std::string > const &  labels,
bool  invert = false 
)

Remove all those Sequences from a SequenceSet whose labels are in the given list.

If invert is set to true, it does the same inverted: it removes all Sequences except those whose label is in the list.

Definition at line 255 of file labels.cpp.

void filter_max_sequence_length ( SequenceSet &  set,
size_t  max_length 
)

Remove all Sequences from the SequenceSet whose length is above the given max_length.

See also filter_min_sequence_length() and filter_min_max_sequence_length().

Definition at line 416 of file sequence/functions/functions.cpp.

void filter_min_max_sequence_length ( SequenceSet &  set,
size_t  min_length,
size_t  max_length 
)

Remove all Sequences from the SequenceSet whose length is not inbetween the min_length and max_length.

This function has the same effect as calling filter_min_sequence_length() and filter_max_sequence_length(), but does it in one iteration over the SequenceSet.

Definition at line 428 of file sequence/functions/functions.cpp.

void filter_min_sequence_length ( SequenceSet &  set,
size_t  min_length 
)

Remove all Sequences from the SequenceSet whose length is below the given min_length.

See also filter_max_sequence_length() and filter_min_max_sequence_length().

Definition at line 404 of file sequence/functions/functions.cpp.

Sequence const * find_sequence ( SequenceSet const &  set,
std::string const &  label 
)

Return a pointer to a Sequence with a specific label, or nullptr iff not found.

Definition at line 52 of file labels.cpp.

utils::Bitvector find_sites ( Sequence const &  seq,
std::string const &  chars 
)

Find sites by character and mark them in a Bitvector.

The function iterates the sites of a Sequence and checkes whether the char at a site is in the provided set of chars (case insensitive). If so, the resulting Bitvector is set to true at that position.

Definition at line 60 of file sequence/functions/functions.cpp.

utils::Bitvector find_sites ( Sequence const &  seq,
utils::CharLookup< bool > const &  chars 
)

Find sites by character and mark them in a Bitvector.

The function iterates the sites of a Sequence and checkes whether the char at a site is set to true in the provided lookup of chars. If so, the resulting Bitvector is set to true at that position.

Definition at line 71 of file sequence/functions/functions.cpp.

size_t gap_site_count ( SiteCounts const &  counts)

Definition at line 157 of file sequence/functions/stats.cpp.

utils::Bitvector gap_sites ( Sequence const &  seq,
std::string const &  gap_chars = nucleic_acid_codes_undetermined() 
)

Return a Bitvector that is true where the Sequence has a gap and false where not.

The gap_chars are used case-insensitively to determine what is considerted to be a gap. By default, nucleic_acid_codes_undetermined() are used, but any other set of characters is allowed.

Definition at line 82 of file sequence/functions/functions.cpp.

utils::Bitvector gap_sites ( SequenceSet const &  set,
std::string const &  gap_chars = nucleic_acid_codes_undetermined() 
)

Return a Bitvector that is true where all Sequences in the SequenceSet have a gap and false where not, that is, where at least on Sequence is not a gap.

The gap_chars are used case-insensitively to determine what is considerted to be a gap. By default, nucleic_acid_codes_undetermined() are used, but any other set of characters is allowed.

Definition at line 87 of file sequence/functions/functions.cpp.

double gapyness ( SequenceSet const &  set,
std::string const &  gap_chars 
)

Return the "gapyness" of the Sequences, i.e., the proportion of gap chars and other completely undetermined chars to the total length of all sequences.

This function returns a value in the interval 0.0 (no gaps and undetermined chars at all) and 1.0 (all chars are undetermined). See nucleic_acid_codes_undetermined() and amino_acid_codes_undetermined() for presettings of gap character that can be used here depending on the data set type. The chars are treated case-insensitive. In the special case that there are no sequences or sites, 0.0 is returned.

Definition at line 144 of file sequence/functions/stats.cpp.

size_t guess_sequence_abundance ( Sequence const &  sequence)

Guess the abundance of a Sequence, using it's label.

The function accepts two patterns of reporting abundances via the label() of a Sequence:

  • Appended via underscore: name_123. In this case, the number has to be the last in the label, that is, no other text may follow.
  • Using the format [;]size=123[;]. The semicoli are optional.

If neither of them is found, a default abundance of 1 is returned.

Definition at line 70 of file labels.cpp.

size_t guess_sequence_abundance ( std::string const &  label)

Guess the abundance of a Sequence, given it's label.

This is the same as guess_sequence_abundance( Sequence const& ), but takes just the label instead of the Sequence object.

Definition at line 75 of file labels.cpp.

bool has_unique_labels ( SequenceSet const &  set,
bool  case_sensitive = true 
)

Return true iff all labels of the Sequences in the SequenceSet are unique.

The optional parameter case_sensitive controls how labels are compared. Default is true, that is, Sequences are compared case-sensitively.

Definition at line 133 of file labels.cpp.

bool has_valid_label ( Sequence const &  seq)

Check whether a Sequence has a valid label.

This might be important for printing the Sequence to a file that needs to be read by other applications. See is_valid_label() for details on what is considered a valid label. See sanitize_label() for a function that replaces all invalid characters of the label by underscores.

Definition at line 208 of file labels.cpp.

bool has_valid_labels ( SequenceSet const &  set)

Check whether all Sequences in a SequenceSet have valid labels.

This might be important for printing the Sequences to a file that needs to be read by other applications. See is_valid_label() for details on what is considered a valid label. See sanitize_labels() for a function that replaces all invalid characters of the labels by underscores.

Definition at line 213 of file labels.cpp.

bool is_alignment ( SequenceSet const &  set)

Return true iff all Sequences in the SequenceSet have the same length.

Definition at line 164 of file sequence/functions/functions.cpp.

bool is_valid_label ( std::string const &  label)

Check whether a given string is a valid label for a Sequence.

While we can work with any form of label (as long as it is a string), most file formats and consequently most programs that read them restrict the set of valid characters for labels of sequences. We thus provide this function, which uses the most common interpretation of valid labels.

A label is valid if its characters have a graphical representation (i.e., isgraph() is true) and if non of these characters occurs:

:,();[]'

Thus, all whitespaces, control characters, and the listed special characters are invalid. See sanitize_label() for a function that replaces all invalid characters of the label by underscores.

Definition at line 197 of file labels.cpp.

std::string kmer_string_overlapping ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Return the sequence spitted into overlapping k-mers.

The function takes the sequence and splits it into k-mers according to settings, using a space char as delimiter between k-mers, with overlap.

For example, the sequence ACGTACGT with k == 3 becomes

ACG CGT GTA TAC ACG CGT

The naming of the function follows

D. Kimothi, A. Soni, P. Biyani, and J. M. Hogan, “Distributed Representations for Biological Sequence Analysis,” CoRR, vol. abs/1608.0, 2016.

See there for details.

Definition at line 426 of file signatures.cpp.

void kmer_string_overlapping ( Sequence const &  sequence,
SignatureSpecifications const &  settings,
std::ostream &  out 
)

Print the sequence spitted into overlapping k-mers.

This is identical to kmer_string_overlapping( Sequence const&, SignatureSpecifications const& ), but prints directly to a stream, which is better for processing large files. After the k-mer sequence, a new line character is printed.

Definition at line 435 of file signatures.cpp.

void genesis::sequence::kmer_string_overlapping_line ( Sequence const &  sequence,
SignatureSpecifications const &  settings,
std::ostream &  out 
)

Local helper function that writes an overlapping kmer string to a stream.

Definition at line 387 of file signatures.cpp.

void genesis::sequence::kmer_string_single_kmer ( Sequence const &  sequence,
SignatureSpecifications const &  settings,
size_t  start,
std::ostream &  out 
)

Local helper function that writes one kmer string to a stream.

Definition at line 357 of file signatures.cpp.

std::vector< std::string > kmer_strings_non_overlapping ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Return the sequence spitted into a set of non-overlapping k-mers.

The function takes the sequence and splits it into k-mers according to settings, using a space char as delimiter between k-mers, without overlap.

For example, the sequence ACGTACGTACGT with k == 3 becomes

ACG TAC GTA CGT
CGT ACG TAC
GTA CGT ACG

The naming of the function follows

D. Kimothi, A. Soni, P. Biyani, and J. M. Hogan, “Distributed Representations for Biological Sequence Analysis,” CoRR, vol. abs/1608.0, 2016.

See there for details.

Definition at line 448 of file signatures.cpp.

void kmer_strings_non_overlapping ( Sequence const &  sequence,
SignatureSpecifications const &  settings,
std::ostream &  out 
)

Print the sequence spitted into non-overlapping k-mers.

This is identical to kmer_strings_non_overlapping( Sequence const&, SignatureSpecifications const& ), but prints directly to a stream, which is better for processing large files. After each k-mer sequence, a new line character is printed.

Definition at line 468 of file signatures.cpp.

void genesis::sequence::kmer_strings_non_overlapping_line ( Sequence const &  sequence,
SignatureSpecifications const &  settings,
std::ostream &  out,
size_t  offset 
)

Local helper function that does one line of a non overlapping kmer string.

Definition at line 411 of file signatures.cpp.

std::unordered_set< std::string > labels ( SequenceSet const &  set)

Return a set of all labels of the SequenceSet.

Definition at line 62 of file labels.cpp.

size_t longest_sequence_length ( SequenceSet const &  set)

Return the length of the longest Sequence in the SequenceSet.

Definition at line 146 of file sequence/functions/functions.cpp.

void merge_duplicate_sequences ( SequenceSet &  set,
MergeDuplicateSequencesCountPolicy  count_policy = MergeDuplicateSequencesCountPolicy::kDiscard,
std::string const &  counter_prefix = "_" 
)

Merge all Sequences in a SequenceSet that have identical sites.

The merging is done by removing all but the first Sequence with identical sites. That means, the resulting "representative" of a set of merged Sequences has the label of the original Sequence that was first in the SequenceSet.

Using the MergeDuplicateSequencesCountPolicy, it is possible to store the counts of the Sequences (i.e., how often they appeard before merging) within the label of the Sequence, separated by counter_prefix. With the default settings, the count is appended to the label, separated by an underscore.

Definition at line 304 of file sequence/functions/functions.cpp.

char normalize_amino_acid_code ( char  code,
bool  accept_degenerated = true 
)

Normalize an amino acid code.

That is, make it upper case and replace all undetermined chars by -. See amino_acid_codes_undetermined() for a list of the latter.

If accept_degenerated is set to true (default), degenerated chars are just put to upper case, but otherwise left as they are. If set to false, an exception is thrown if a degenerated char is encountered. See amino_acid_codes_degenerated() for their list.

Lastly, an exception is also thrown for non amino acid codes, that is all chars that are not part of amino_acid_codes_all().

Definition at line 414 of file codes.cpp.

void normalize_amino_acid_codes ( Sequence &  sequence,
bool  accept_degenerated = true 
)

Call normalize_amino_acid_code() for each site of the Sequence.

See there for details.

Definition at line 386 of file sequence/functions/functions.cpp.

void normalize_amino_acid_codes ( SequenceSet &  sequence_set,
bool  accept_degenerated = true 
)

Call normalize_amino_acid_code() for each site of all Sequences in the SequenceSet.

See there for details.

Definition at line 393 of file sequence/functions/functions.cpp.

std::string normalize_code_alphabet ( std::string const &  alphabet)

Normalize an alphabet set of Sequence codes, i.e., make them upper case, sort them, and remove duplicates.

For example, when given a set of nucleic acid codes like "aGtc", the function returns "ACGT". This is useful to get consistent codes in functions that accept a user defined code alphabet.

Definition at line 346 of file codes.cpp.

char normalize_nucleic_acid_code ( char  code,
bool  accept_degenerated = true 
)

Normalize a nucleic acide code.

That is, make it upper case, replace U by T, replace all undetermined chars by -. See nucleic_acid_codes_undetermined() for a list of the latter.

If accept_degenerated is set to true (default), degenerated chars are just put to upper case, but otherwise left as they are. If set to false, an exception is thrown if a degenerated char is encountered. See nucleic_acid_codes_degenerated() for their list.

Lastly, an exception is also thrown for non nucleic acid codes, that is all chars that are not part of nucleic_acid_codes_all().

Definition at line 355 of file codes.cpp.

void normalize_nucleic_acid_codes ( Sequence &  sequence,
bool  accept_degenerated = true 
)

Call normalize_nucleic_acid_code() for each site of the Sequence.

See there for details.

Definition at line 372 of file sequence/functions/functions.cpp.

void normalize_nucleic_acid_codes ( SequenceSet &  sequence_set,
bool  accept_degenerated = true 
)

Call normalize_nucleic_acid_code() for each site of all Sequences in the SequenceSet.

See there for details.

Definition at line 379 of file sequence/functions/functions.cpp.

std::string nucleic_acid_ambiguities ( char  code)

Return the possible ambiguous nucleic acid codes for a given code char.

The codes are resolved as follows:

'A' ==> "A"
'C' ==> "C"
'G' ==> "G"
'T' ==> "T"
'U' ==> "T"

'W' ==> "AT"
'S' ==> "CG"
'M' ==> "AC"
'K' ==> "GT"
'R' ==> "AG"
'Y' ==> "CT"

'B' ==> "CGT"
'D' ==> "AGT"
'H' ==> "ACT"
'V' ==> "ACG"

'N' ==> "ACGT"
'O' ==> "-"
'X' ==> "-"
'.' ==> "-"
'-' ==> "-"
'?' ==> "-"

The code char is treated case-insensitive. If the given code char is not valid, an std::out_of_range exception is thrown.

See nucleic_acid_ambiguity_code() for a reverse version of this function. It is however not exactly the reverse, as some degenerated codes are mapped to the gap char. Thus, this function is not injective.

Definition at line 604 of file codes.cpp.

char nucleic_acid_ambiguity_code ( std::string  codes)

Return the nucleic acid code that represents all given codes.

The codes are resolved as follows:

"A"    ==> 'A'
"C"    ==> 'C'
"G"    ==> 'G'
"T"    ==> 'T'

"AT"   ==> 'W'
"CG"   ==> 'S'
"AC"   ==> 'M'
"GT"   ==> 'K'
"AG"   ==> 'R'
"CT"   ==> 'Y'

"CGT"  ==> 'B'
"AGT"  ==> 'D'
"ACT"  ==> 'H'
"ACG"  ==> 'V'

"ACGT" ==> 'N'
"-"    ==> '-'

The given codes are treated case-insensitive and order-independent. For example, given "tCgG", the function still returns 'B'. However, if any of the given codes is not valid, an std::out_of_range exception is thrown.

See nucleic_acid_ambiguities() for the reverse of this function.

Definition at line 613 of file codes.cpp.

std::string nucleic_acid_codes_all ( )

Return all valid nucleic acid codes. Those are "ACGTUWSMKRYBDHVNOX.-?".

Definition at line 309 of file codes.cpp.

std::string nucleic_acid_codes_degenerated ( )

Return all degenerated nucleic acid codes. Those are "WSMKRYBDHV".

Definition at line 299 of file codes.cpp.

std::string nucleic_acid_codes_plain ( )

Return all plain nucleic acid codes. Those are "ACGTU".

Definition at line 294 of file codes.cpp.

std::string nucleic_acid_codes_undetermined ( )

Return all undetermined nucleic acid codes. Those are "NOX.-?".

Definition at line 304 of file codes.cpp.

std::map< char, utils::Color > nucleic_acid_colors ( )

Return a map of Colors for each nucleic acid code.

This function gives a Color for each nucleic acid code.

Definition at line 572 of file codes.cpp.

std::string nucleic_acid_name ( char  code)

Get the name of a nucleic acid given its IUPAC code.

The codes are translated as follows:

A Adenine
C Cytosine
G Guanine
T Thymine
U Uracil
W Weak
S Strong
M aMino
K Keto
R puRine
Y pYrimidine
B not A
D not C
H not G
V not T
N any
O omitted
X masked
. gap
- gap
? gap

The code char is treated case-insensitive. If the given code char is not valid, an std::out_of_range exception is thrown.

Definition at line 586 of file codes.cpp.

std::map< char, std::string > nucleic_acid_text_colors ( )

Return a map of text colors for each nucleic acid code.

This function gives a color name usable for utils::Style for each nucleic acid code. The return value of this function can for example be used in sequence::print_color() function.

Definition at line 562 of file codes.cpp.

bool genesis::sequence::operator& ( SiteEntropyOptions  lhs,
SiteEntropyOptions  rhs 
)
inline

And-operator to check whether a SiteEntropyOptions is set.

Typical usage:

SiteEntropyOptions options;
if( options & SiteEntropyOptions::kWeighted ) {
    // Do stuff...
}

Use the or-operator in order to set and combine options.

Definition at line 190 of file sequence/functions/entropy.hpp.

std::ostream & operator<< ( std::ostream &  out,
Sequence const &  seq 
)

Print a Sequence to an ostream in the form "label: sites".

As this is meant for quickly having a look at the Sequence, only the first 100 sites are printed. If you need all sites, or more settings like color, use SimplePrinter.

Definition at line 449 of file sequence/functions/functions.cpp.

std::ostream & operator<< ( std::ostream &  out,
SequenceSet const &  set 
)

Print a SequenceSet to an ostream in the form "label: sites".

As this is meant for quickly having a look at the SequenceSet, only the first 10 Sequences and the first 100 sites of each are printed. If you need all sequences and sites, or more settings like color, use SimplePrinter.

Definition at line 458 of file sequence/functions/functions.cpp.

SiteEntropyOptions genesis::sequence::operator| ( SiteEntropyOptions  lhs,
SiteEntropyOptions  rhs 
)
inline

Or-operator to combine two SiteEntropyOptionss.

Typical usage:

auto options = SiteEntropyOptions::kWeighted | SiteEntropyOptions::kNormalized;

Use the and-operator in order to check whether an option is set.

Definition at line 149 of file sequence/functions/entropy.hpp.

SiteEntropyOptions& genesis::sequence::operator|= ( SiteEntropyOptions &  lhs,
SiteEntropyOptions  rhs 
)
inline

Or-assignment-operator to combine two SiteEntropyOptionss.

Typical usage:

SiteEntropyOptions options;
options |= SiteEntropyOptions::kWeighted;

Use the and-operator in order to check whether an option is set.

Definition at line 168 of file sequence/functions/entropy.hpp.

void relabel_md5 ( Sequence &  seq)

Relabel the Sequence using the MD5 hash digest of its sites.

Definition at line 180 of file labels.cpp.

void relabel_md5 ( SequenceSet &  set)

Relabel all Sequences in the SequenceSet using the MD5 hash digest of the sites.

If there are duplicate Sequences, this function will lead to multiple Sequences with the same name, which might be an issue for downstream programs that expect unique labels. See has_unique_labels() to check this.

Definition at line 186 of file labels.cpp.

void relabel_sha1 ( Sequence &  seq)

Relabel the Sequence using the SHA1 hash digest of its sites.

Definition at line 154 of file labels.cpp.

void relabel_sha1 ( SequenceSet &  set)

Relabel all Sequences in the SequenceSet using the SHA1 hash digest of the sites.

If there are duplicate Sequences, this function will lead to multiple Sequences with the same name, which might be an issue for downstream programs that expect unique labels. See has_unique_labels() to check this.

Definition at line 160 of file labels.cpp.

void relabel_sha256 ( Sequence &  seq)

Relabel the Sequence using the SHA256 hash digest of its sites.

Definition at line 167 of file labels.cpp.

void relabel_sha256 ( SequenceSet &  set)

Relabel all Sequences in the SequenceSet using the SHA256 hash digest of the sites.

If there are duplicate Sequences, this function will lead to multiple Sequences with the same name, which might be an issue for downstream programs that expect unique labels. See has_unique_labels() to check this.

Definition at line 173 of file labels.cpp.

void remove_all_gaps ( Sequence &  seq,
std::string const &  gap_chars = nucleic_acid_codes_undetermined() 
)

Remove all gap characters from the sites of the Sequence.

This function is an alias for remove_characters(), which by default uses the gap sites of nucleic_acid_codes_undetermined().

Definition at line 244 of file sequence/functions/functions.cpp.

void remove_all_gaps ( SequenceSet &  set,
std::string const &  gap_chars = nucleic_acid_codes_undetermined() 
)

Remove all gap characters from the sites of the Sequences in the SequenceSet.

This function is an alias for remove_characters(), which by default uses the gap sites of nucleic_acid_codes_undetermined().

Definition at line 249 of file sequence/functions/functions.cpp.

void remove_characters ( Sequence &  seq,
std::string const &  search 
)

Remove all of the characters in search from the sites of the Sequence.

Definition at line 227 of file sequence/functions/functions.cpp.

void remove_characters ( SequenceSet &  set,
std::string const &  search 
)

Remove all of the characters in search from the sites of the Sequences in the SequenceSet.

Definition at line 237 of file sequence/functions/functions.cpp.

void remove_gap_sites ( SequenceSet &  set,
std::string const &  gap_chars 
)

Remove all sites that only contain gap characters from the SequenceSet.

Definition at line 221 of file sequence/functions/functions.cpp.

void remove_sites ( Sequence &  seq,
utils::Bitvector  sites 
)

Remove all sites from a Sequence where the given Bitvector is true, and keep all others.

The Bitvector needs to have the same size as the Sequence, otherwise an expection is thrown.

This function is for example useful in combination with gap_sites().

Definition at line 183 of file sequence/functions/functions.cpp.

void remove_sites ( SequenceSet &  set,
utils::Bitvector  sites 
)

Remove all sites from all Sequences in a SequenceSet where the given Bitvector is true, and keep all others.

The Bitvector and all Sequences need to have the same size, otherwise an expection is thrown. This check is done before any Sequence is changed. Thus, if the function throws for this reason, the Sequences are left unchanged.

This function is for example useful in combination with gap_sites().

Definition at line 205 of file sequence/functions/functions.cpp.

void replace_characters ( Sequence &  seq,
std::string const &  search,
char  replacement 
)

Replace all occurences of the chars in search by the replace char, for all sites in the given Sequence.

The function is case sensitive. Thus, you need to use both cases for the search string if you are unsure. The replace char is always used as is, independent of the case of the matching search char.

Definition at line 254 of file sequence/functions/functions.cpp.

void replace_characters ( SequenceSet &  set,
std::string const &  search,
char  replacement 
)

Replace all occurences of the chars in search by the replace char, for all sites in the Sequences in the given SequenceSet.

The function is case sensitive. Thus, you need to use both cases for the search string if you are unsure. The replace char is always used as is, independent of the case of the matching search char.

Definition at line 259 of file sequence/functions/functions.cpp.

void replace_t_with_u ( Sequence &  seq)

Replace all occurrences of T by U in the sites of the Sequence.

This is a small helper function for sequences with nucleic acid codes. It is case sensitive, that is, lower case t is replaced by lower case u, and upper case T by upper case U.

Definition at line 285 of file sequence/functions/functions.cpp.

void replace_t_with_u ( SequenceSet &  set)

Replace all occurrences of T by U in the sites of all Sequences in the SequenceSet.

This is a small helper function for sequences with nucleic acid codes. It is case sensitive, that is, lower case t is replaced by lower case u, and upper case T by upper case U.

Definition at line 297 of file sequence/functions/functions.cpp.

void replace_u_with_t ( Sequence &  seq)

Replace all occurrences of U by T in the sites of the Sequence.

This is a small helper function for sequences with nucleic acid codes. It is case sensitive, that is, lower case u is replaced by lower case t, and upper case U by upper case T.

Definition at line 266 of file sequence/functions/functions.cpp.

void replace_u_with_t ( SequenceSet &  set)

Replace all occurrences of U by T in the sites of all Sequences in the SequenceSet.

This is a small helper function for sequences with nucleic acid codes. It is case sensitive, that is, lower case u is replaced by lower case t, and upper case U by upper case T.

Definition at line 278 of file sequence/functions/functions.cpp.

std::string reverse_complement ( std::string const &  sequence,
bool  accept_degenerated = true 
)

Get the reverse complement of a nucleic acid sequence.

That is, reverse the string and flip A with T and C with G. Gap characters are normalized to -, and an exception is thrown for invalid characters.

If furthermore accept_degenerated is true (default), degenerated codes are also flipped. For example M == AC becomes K == TG, W == AT stays the same, and B == CGT becomes V = GCA. If set to false, an exception is thrown when degenerated chars are found.

Definition at line 487 of file codes.cpp.

std::string sanitize_label ( std::string const &  label)

Sanitize a label by replacing all invalid characters with underscores.

This might be important for printing the Sequences to a file that needs to be read by other applications. See is_valid_label() for details on what is considered a valid label.

Definition at line 223 of file labels.cpp.

void sanitize_label ( Sequence &  seq)

Sanitize a label by replacing all invalid characters with underscores.

This might be important for printing the Sequences to a file that needs to be read by other applications. See is_valid_label() for details on what is considered a valid label.

Definition at line 239 of file labels.cpp.

void sanitize_labels ( SequenceSet &  set)

Sanitize the labels of all Sequences in the SequenceSet by replacing all invalid characters with underscores.

This might be important for printing the Sequences to a file that needs to be read by other applications. See is_valid_label() for details on what is considered a valid label.

Definition at line 244 of file labels.cpp.

std::vector<double> genesis::sequence::signature_complementarity_frequencies_helper ( Sequence const &  sequence,
SignatureSpecifications const &  settings,
Combinator  combinator 
)

Local helper function that returns a vector where the frequencies of the non-palindromic kmers are combined using a functor.

Definition at line 210 of file signatures.cpp.

std::vector< size_t > signature_counts ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Count the occurences of k-mers in the sequence according to the settings.

The function returns a vector that contains the count for each k-mer that can be build from the characters in the alphabet, in the order given by SignatureSpecifications::kmer_list().

Definition at line 57 of file signatures.cpp.

std::vector< double > signature_frequencies ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the frequencies of occurences of k-mers in the sequence according to the settings.

The function simply calculates the frequencies as the normalized values of signature_counts().

Definition at line 117 of file signatures.cpp.

std::vector< double > signature_frequency_ratios_1 ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the ratio 1 signature of a sequence.

The function is calculated according to

F. Gori, D. Mavroedis, M. S. M. Jetten, and E. Marchiori, “Genomic signatures for metagenomic data analysis: Exploiting the reverse complementarity of tetranucleotides,” 2011 IEEE Int. Conf. Syst. Biol. ISB 2011, pp. 149–154, 2011.

It excludes palindromic k-mers where the reverse complement is the k-mer itself.

Definition at line 296 of file signatures.cpp.

std::vector< double > signature_frequency_ratios_2 ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the ratio 2 signature of a sequence.

The function is calculated according to

F. Gori, D. Mavroedis, M. S. M. Jetten, and E. Marchiori, “Genomic signatures for metagenomic data analysis: Exploiting the reverse complementarity of tetranucleotides,” 2011 IEEE Int. Conf. Syst. Biol. ISB 2011, pp. 149–154, 2011.

It excludes palindromic k-mers where the reverse complement is the k-mer itself.

Definition at line 316 of file signatures.cpp.

std::vector< double > signature_jensen_shannon ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the Jensen-Shannon (JS) signature of a sequence.

The function is calculated according to

F. Gori, D. Mavroedis, M. S. M. Jetten, and E. Marchiori, “Genomic signatures for metagenomic data analysis: Exploiting the reverse complementarity of tetranucleotides,” 2011 IEEE Int. Conf. Syst. Biol. ISB 2011, pp. 149–154, 2011.

using details of

J. Lin, “Divergence Measures Based on the Shannon Entropy,” IEEE Trans. Inf. Theory, vol. 37, no. 1, pp. 145–151, 1991.

It excludes palindromic k-mers where the reverse complement is the k-mer itself.

Definition at line 334 of file signatures.cpp.

std::vector< double > signature_maximal_complementarity_frequencies ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the signature of a sequence that uses the maximum frequency of reverse complement k-mers.

The function is calculated according to

F. Gori, D. Mavroedis, M. S. M. Jetten, and E. Marchiori, “Genomic signatures for metagenomic data analysis: Exploiting the reverse complementarity of tetranucleotides,” 2011 IEEE Int. Conf. Syst. Biol. ISB 2011, pp. 149–154, 2011.

It excludes palindromic k-mers where the reverse complement is the k-mer itself.

Definition at line 257 of file signatures.cpp.

std::vector< double > signature_minimal_complementarity_frequencies ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the signature of a sequence that uses the minimum frequency of reverse complement k-mers.

The function is calculated according to

F. Gori, D. Mavroedis, M. S. M. Jetten, and E. Marchiori, “Genomic signatures for metagenomic data analysis: Exploiting the reverse complementarity of tetranucleotides,” 2011 IEEE Int. Conf. Syst. Biol. ISB 2011, pp. 149–154, 2011.

It excludes palindromic k-mers where the reverse complement is the k-mer itself.

Definition at line 244 of file signatures.cpp.

std::vector< size_t > signature_ranks ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calcuate the rank signature of a sequence according to the settings.

That is, standard ranking is applied to the k-mer counts of the Sequence.

Definition at line 183 of file signatures.cpp.

std::vector< double > signature_reverse_identity_frequencies ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the signature of a sequence that uses only the frequencies of k-mers whose reverse complement is the k-mer itself.

The function is calculated according to

F. Gori, D. Mavroedis, M. S. M. Jetten, and E. Marchiori, “Genomic signatures for metagenomic data analysis: Exploiting the reverse complementarity of tetranucleotides,” 2011 IEEE Int. Conf. Syst. Biol. ISB 2011, pp. 149–154, 2011.

It excludes k-mers where the reverse complement is a different k-mer.

Definition at line 270 of file signatures.cpp.

std::vector< size_t > signature_symmetrized_counts ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calcuate the symmetrized counts of the sequence according to the settings.

The function uses signature_counts(), and sums up the counts of k-mers that are reverse complements of each other.

Definition at line 159 of file signatures.cpp.

std::vector< double > signature_symmetrized_frequencies ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calcuate the symmetrized counts of the sequence according to the settings.

The function uses signature_frequencies(), and sums up the counts of k-mers that are reverse complements of each other.

Definition at line 169 of file signatures.cpp.

std::vector<T> genesis::sequence::signature_symmetrized_helper ( std::vector< T > const &  kmers,
SignatureSpecifications const &  settings 
)

Local helper function that adds up the values for reverse complement k-mers.

Definition at line 142 of file signatures.cpp.

std::vector< size_t > signature_symmetrized_ranks ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calcuate the symmetrized rank signature of a sequence according to the settings.

That is, standard ranking is applied to the symmetrized (combined reverse complement) k-mer counts of the Sequence.

Definition at line 192 of file signatures.cpp.

double site_entropy ( SiteCounts const &  counts,
size_t  site_index,
SiteEntropyOptions  options = SiteEntropyOptions::kDefault 
)

Calculate the entropy at one site of a SiteCounts object.

The entropy $ H $ (uncertainty) at site $ i $ (= site_idx) is calculated as $ H_{i}=-\sum f_{{c,i}}\times \log _{2}f_{{c,i}} $, where $ f_{c,i} $ is the relative frequency of character $ c $ at site $ i $, summed over all characters in the SiteCounts object.

The function additionally takes optional flags to refine the calculation, see SiteEntropyOptions for their explanation.

Definition at line 49 of file sequence/functions/entropy.cpp.

std::map< char, size_t > site_histogram ( Sequence const &  seq)

Get a histogram of the occurrences of particular sites, given a Sequence.

This gives the raw counts of how often each site (character) appears in the Sequence. See base_frequencies() for the relative version of this function.

Definition at line 52 of file sequence/functions/stats.cpp.

std::map< char, size_t > site_histogram ( SequenceSet const &  set)

Get a histogram of the occurrences of particular sites, given a SequenceSet.

This gives the raw counts of how often each site (character) appears in the whole set. See base_frequencies() for the relative version of this function.

Definition at line 60 of file sequence/functions/stats.cpp.

double site_information ( SiteCounts const &  counts,
size_t  site_index,
bool  use_small_sample_correction = false,
SiteEntropyOptions  options = SiteEntropyOptions::kDefault 
)

Calculate the information content at one site of a SiteCounts object.

The information content $ R $ at site $ i $ (= site_index) is calculated as $ R_{i} = \log_{2}( s ) - (H_{i}+e_{n}) $.

Here, $ s $ is the number of possible characters in the sequences (usually, 4 for nucleic acids and 20 for amino acids), which is taken from the characters() used in the SiteCounts object. Furthermore, $ H_{i} $ is the site_entropy() at the given site.

The optional term $ e_{n} $ is the small-sample correction, calculated as $ e_{n}={\frac{1}{\ln {2}}}\times {\frac{s-1}{2n}} $, with $ n $ being the number of sequences. It is only used if use_small_sample_correction is set to true (default is false).

The function additionally takes optional flags to refine the site entropy calculation, see SiteEntropyOptions for their explanation.

Definition at line 113 of file sequence/functions/entropy.cpp.

void swap ( SequenceSet &  lhs,
SequenceSet &  rhs 
)

Definition at line 42 of file sequence_set.cpp.

size_t total_length ( SequenceSet const &  set)

Return the total length (sum) of all Sequences in the SequenceSet.

Definition at line 155 of file sequence/functions/functions.cpp.

bool validate_chars ( SequenceSet const &  set,
std::string const &  chars 
)

Returns true iff all Sequences only consist of the given chars.

For presettings of usable chars, see the functions nucleic_acid_codes_... and amino_acid_codes_.... For example, to check whether the sequences are nucleic acids, use nucleic_acid_codes_all(). The chars are treated case-insensitive.

If chars contains invalid (non-standard ASCII) characters, an std::invalid_argument exception is thrown.

Definition at line 121 of file sequence/functions/functions.cpp.

Typedef Documentation

using CountPair = std::pair< size_t, char >

Helper struct to store characters sorted by their count.

Definition at line 55 of file consensus.cpp.

Enumeration Type Documentation

Provide options for changing how merge_duplicate_sequences() handles the counts of merged Sequences.

This allows to remove duplicate sequences while still keeping track of how many there were before merging them.

Enumerator
kDiscard 

The counts are discarded.

kAppendToLabel 

The counts are appended to the sequence label, separated by the counter_prefix.

Definition at line 251 of file sequence/functions/functions.hpp.

enum SiteEntropyOptions : unsigned char
strong

Option flags to refine the calculation of site_entropy().

The flags can be combined via the binary or operator |:

auto flags = SiteEntropyOptions::kIncludeGaps | SiteEntropyOptions::kNormalized;

For checking whether a partcular option is set, use the binary and operator &:

if( flags & SiteEntropyOptions::kIncludeGaps ) {
    // ...
}

The option flags can be used with all functions that calculate the entropy. They are applied on a per-site basis; i.e., they are used for calculating the site_entropy(), which is the basis for higher level entropy functions like absolute_entropy() and averaged_entropy().

Enumerator
kDefault 

Default option, simply calculate the site entropy using the characters used in the SiteCounts object.

kIncludeGaps 

In addition to the characters of the SiteCounts object, use the undetermined and gap characters.

With this option, an additional term is added to the entropy, using the "rest" probability of the site. The counts of all characters at a site in the SiteCounts do not always add up to the number of sequences that have been added. In cases where a Sequence contains gaps or characters that are not set in the SiteCounts object, those counts are simply ignored when processing the Sequence and counting its sites.

Using this ignored rest, an additional entropy term is calculated and added to the total entropy, if this option is used.

For example, if the SiteCounts object counts all occurences of A, C, G and T, and a Sequence added to it contains a gap - or a degenerated character like W, those character are ignored in the counts. The total number of those characters is then:

auto gap_count = counts.added_sequences_count()
               - counts.count_of( 'A', site_idx )
               - counts.count_of( 'C', site_idx )
               - counts.count_of( 'G', site_idx )
               - counts.count_of( 'T', site_idx );

This number is then used as an additional summand for the entropy: $ H_{i} $ += $ -f_{{-,i}}\times \log _{2}f_{{-,i}} $, with $ f_{-,i} = $ gap_count/counts.added_sequences_count() being the relative frequency of those gaps and undetermined characters.

kWeighted 

Weight the entropy using the summed relative frequencies of the characters.

The entropy per site depends on the frequencies of the chracters. However, per default, gaps and other undetermined characters (those which are not used in the SiteCounts object) are igonred. Thus, the entropy for sites that contain mostly gaps might still have quite a high value. In cases where mostly-gap sites shall add little to the total entropy, this option can be used to reduce their influence. The site_entropy() is then weighted using the sum of the frequencies of the determined sites.

For example, if a site has fractions of 0.1 for all four characters A, C, G and T, the resulting entropy is weighted with a factor of 0.4.

See also SiteEntropyOptions::kIncludeGaps for a related option to take those into accoount.

kNormalized 

Normalize the resulting entropy using the maximum entropy possible.

This option results in entropy values in the range [ 0.0, 1.0 ]. This is achieved by dividing the entropy by the maxmimal value that is possible given the used characters of the SiteCounts object. For example, with the four characters A, C, G and T, the entropy is divided by $ log_2{(4)} $.

If additionally the SiteEntropyOptions::kIncludeGaps flag is set, the divisor is calculated using one additional value, e.g., $ log_2{(4+1)} $.

Definition at line 70 of file sequence/functions/entropy.hpp.

Variable Documentation

const std::unordered_map<char, std::string> amino_acid_code_to_name
static

Definition at line 76 of file codes.cpp.

const std::map<char, utils::Color> amino_acid_colors_map
static

Definition at line 200 of file codes.cpp.

const std::map<char, std::string> amino_acid_text_colors_map
static

Definition at line 140 of file codes.cpp.

const std::unordered_map<char, std::string> nucleic_acid_ambiguity_char_map
static
Initial value:
= {
{ 'A', "A" },
{ 'C', "C" },
{ 'G', "G" },
{ 'T', "T" },
{ 'U', "T" },
{ 'W', "AT" },
{ 'S', "CG" },
{ 'M', "AC" },
{ 'K', "GT" },
{ 'R', "AG" },
{ 'Y', "CT" },
{ 'B', "CGT" },
{ 'D', "AGT" },
{ 'H', "ACT" },
{ 'V', "ACG" },
{ 'N', "ACGT" },
{ 'O', "-" },
{ 'X', "-" },
{ '.', "-" },
{ '-', "-" },
{ '?', "-" }
}

Definition at line 237 of file codes.cpp.

const std::unordered_map< std::string, char > nucleic_acid_ambiguity_string_map
static
Initial value:
= {
{ "A", 'A' },
{ "C", 'C' },
{ "G", 'G' },
{ "T", 'T' },
{ "AT", 'W' },
{ "CG", 'S' },
{ "AC", 'M' },
{ "GT", 'K' },
{ "AG", 'R' },
{ "CT", 'Y' },
{ "CGT", 'B' },
{ "AGT", 'D' },
{ "ACT", 'H' },
{ "ACG", 'V' },
{ "ACGT", 'N' },
{ "-", '-' }
}

Definition at line 264 of file codes.cpp.

const std::unordered_map<char, std::string> nucleic_acid_code_to_name
static
Initial value:
= {
{ 'A', "Adenine" },
{ 'C', "Cytosine" },
{ 'G', "Guanine" },
{ 'T', "Thymine" },
{ 'U', "Uracil" },
{ 'W', "Weak" },
{ 'S', "Strong" },
{ 'M', "aMino" },
{ 'K', "Keto" },
{ 'R', "puRine" },
{ 'Y', "pYrimidine" },
{ 'B', "not A" },
{ 'D', "not C" },
{ 'H', "not G" },
{ 'V', "not T" },
{ 'N', "any" },
{ 'O', "omitted" },
{ 'X', "masked" },
{ '.', "gap" },
{ '-', "gap" },
{ '?', "gap" }
}

Definition at line 49 of file codes.cpp.

const std::map<char, utils::Color> nucleic_acid_colors_map
static
Initial value:
= {
{ 'A', { 1.0, 0.0, 0.0 } },
{ 'C', { 0.0, 1.0, 0.0 } },
{ 'G', { 1.0, 1.0, 0.0 } },
{ 'T', { 0.0, 0.0, 1.0 } },
{ 'U', { 0.0, 0.0, 1.0 } },
{ 'W', { 0.376, 0.376, 0.376 } },
{ 'S', { 0.376, 0.376, 0.376 } },
{ 'M', { 0.376, 0.376, 0.376 } },
{ 'K', { 0.376, 0.376, 0.376 } },
{ 'R', { 0.376, 0.376, 0.376 } },
{ 'Y', { 0.376, 0.376, 0.376 } },
{ 'B', { 0.5, 0.5, 0.5 } },
{ 'D', { 0.5, 0.5, 0.5 } },
{ 'H', { 0.5, 0.5, 0.5 } },
{ 'V', { 0.5, 0.5, 0.5 } },
{ 'N', { 0.5, 0.5, 0.5 } },
{ 'O', { 0.5, 0.5, 0.5 } },
{ 'X', { 0.5, 0.5, 0.5 } },
{ '.', { 0.5, 0.5, 0.5 } },
{ '-', { 0.5, 0.5, 0.5 } },
{ '?', { 0.5, 0.5, 0.5 } }
}

Definition at line 173 of file codes.cpp.

const std::map<char, std::string> nucleic_acid_text_colors_map
static
Initial value:
= {
{ 'A', "Red" },
{ 'C', "Green" },
{ 'G', "Yellow" },
{ 'T', "Blue" },
{ 'U', "Blue" },
{ 'W', "DarkGray" },
{ 'S', "DarkGray" },
{ 'M', "DarkGray" },
{ 'K', "DarkGray" },
{ 'R', "DarkGray" },
{ 'Y', "DarkGray" },
{ 'B', "DarkGray" },
{ 'D', "DarkGray" },
{ 'H', "DarkGray" },
{ 'V', "DarkGray" },
{ 'N', "DarkGray" },
{ 'O', "DarkGray" },
{ 'X', "DarkGray" },
{ '.', "DarkGray" },
{ '-', "DarkGray" },
{ '?', "DarkGray" }
}

Definition at line 113 of file codes.cpp.