A library for working with phylogenetic and population genetic data.
v0.32.0
genesis::sequence Namespace Reference

Classes

class  FastaReader
 Read Fasta sequence data. More...
 
class  FastaWriter
 Write Fasta data. More...
 
class  FastqReader
 Read Fastq sequence data. More...
 
class  FastqWriter
 Write Fastq data. More...
 
class  FastxInputStream
 Stream through an input source and parse it as Fasta or Fastq sequences. More...
 
class  FastxOutputStream
 Write Fasta or Fastq data, sequentially. More...
 
class  Kmer
 Kmer class template for representing k-mers of various sizes, currently up to k-32. More...
 
class  KmerScanner
 
struct  KmerTagDefault
 Default Tag for a Kmer, used when no other Tag is provided. More...
 
struct  LabelAttributes
 
class  MicrovariantScanner
 
class  NucleotideAlphabet
 
class  PhylipReader
 Read Phylip sequence data. More...
 
class  PhylipWriter
 
class  PrinterBitmap
 Print the sites of a SequenceSet as pixels in a bitmap. More...
 
class  PrinterSimple
 Simple printer class for Sequences and SequenceSets. More...
 
class  ReferenceGenome
 Lookup of Sequences of a reference genome. More...
 
class  Sequence
 
class  SequenceDict
 Store dictionary/index data on sequence files, such as coming from .fai or .dict files. More...
 
class  SequenceSet
 Store a set of Sequences. More...
 
class  SignatureSpecifications
 Specifications for calculating signatures (like k-mer counts) from Sequences. More...
 
class  SiteCounts
 Store counts of the occurence for certain characters at the sites of Sequences. More...
 

Functions

double absolute_entropy (SiteCounts const &counts, SiteEntropyOptions per_site_options=SiteEntropyOptions::kDefault)
 Return the sum of all site entropies. More...
 
double absolute_information (SiteCounts const &counts, bool use_small_sample_correction=false, SiteEntropyOptions per_site_options=SiteEntropyOptions::kDefault)
 Calculate the information content across all sites of a SiteCounts object. More...
 
std::string amino_acid_codes_all ()
 Return all valid amino acid codes, and .. Those are ACDEFGHIKLMNOPQRSTUVWYBJZX*-?.. More...
 
std::string amino_acid_codes_degenerated ()
 Return all degenerated amino acid codes. Those are BJZ. More...
 
std::string amino_acid_codes_plain ()
 Return all plain amino acid codes. Those are ACDEFGHIKLMNOPQRSTUVWY. More...
 
std::string amino_acid_codes_undetermined ()
 Return all undetermined amino acid codes, and .. Those are X*-?.. More...
 
std::map< char, utils::Coloramino_acid_colors ()
 Return a map of Colors for each amino acid code. More...
 
std::string amino_acid_name (char code)
 Get the name of a amino acid given its IUPAC code. More...
 
std::map< char, std::string > amino_acid_text_colors ()
 Return a map of text colors for each amino acid code. More...
 
double average_entropy (SiteCounts const &counts, bool only_determined_sites=false, SiteEntropyOptions per_site_options=SiteEntropyOptions::kDefault)
 Return the average sum of all site entropies. More...
 
double average_information (SiteCounts const &counts, bool only_determined_sites=false, bool use_small_sample_correction=false, SiteEntropyOptions per_site_options=SiteEntropyOptions::kDefault)
 Calculate the information content across all sites of a SiteCounts object. More...
 
std::map< char, double > base_frequencies (Sequence const &seq, std::string const &plain_chars)
 Get the base frequencies of the sites in a Sequence given the base chars. More...
 
std::map< char, double > base_frequencies (SequenceSet const &set, std::string const &plain_chars)
 Get the base frequencies of the sites in a SequenceSet given the base chars. More...
 
static std::map< char, double > base_frequencies_accumulator (std::map< char, size_t > const &sitehistogram, std::string const &plain_chars)
 Local helper function that turns a site histogram into base frequencies. More...
 
template<typename Tag >
Kmer< Tag > canonical_representation (Kmer< Tag > const &kmer)
 
bool compatible_quality_encodings (QualityEncoding lhs, QualityEncoding rhs)
 Return whether two quality encodings are compatible with each other. More...
 
bool compatible_references (SequenceDict const &lhs, SequenceDict const &rhs, ReferenceComparisonMode mode=ReferenceComparisonMode::kStrict)
 Verify that a SequenceDict fits a SequenceSet. More...
 
std::string consensus_sequence_cavener (SequenceSet const &sequences, bool allow_gaps=true)
 Calculate a consensus sequence using the method by Cavener for nucleic acid codes ACGT and their ambiguities. More...
 
std::string consensus_sequence_cavener (SiteCounts const &counts, bool allow_gaps=true)
 Calculate a consensus sequence using the method by Cavener for nucleic acid codes ACGT and their ambiguities. More...
 
template<typename CharSelector >
std::string consensus_sequence_template (SiteCounts const &counts, bool const allow_gaps, CharSelector const &char_selector)
 Local helper function template that handles the common code for the nucleic acid consensus sequence functions. More...
 
std::string consensus_sequence_with_ambiguities (SequenceSet const &sequences, double similarity_factor=0.9, bool allow_gaps=true)
 Calculate a consensus sequence by using the most frequent characters at each site, for nucleic acid codes ACGT and their ambiguities. More...
 
std::string consensus_sequence_with_ambiguities (SiteCounts const &counts, double similarity_factor=0.9, bool allow_gaps=true)
 Calculate a consensus sequence by using the most frequent characters at each site, for nucleic acid codes ACGT and their ambiguities. More...
 
std::string consensus_sequence_with_majorities (SequenceSet const &sequences, bool allow_gaps=true)
 Calculate the majority rule consensus sequence by using the most frequent character at each site for nucleic acid codes ACGT. More...
 
std::string consensus_sequence_with_majorities (SequenceSet const &sequences, std::string const &characters, bool allow_gaps=true, char gap_char='-')
 Calculate the majority rule consensus sequence by using the most frequent character at each site. More...
 
std::string consensus_sequence_with_majorities (SiteCounts const &counts, bool allow_gaps=true, char gap_char='-')
 Calculate the majority rule consensus sequence by using the most frequent character at each site. More...
 
std::string consensus_sequence_with_threshold (SequenceSet const &sequences, double frequency_threshold=0.6, bool allow_gaps=true, bool use_ambiguities=true)
 Calculate a consensus sequence where the character frequency needs to be above a given threshold, for nucleic acid codes ACGT. More...
 
std::string consensus_sequence_with_threshold (SiteCounts const &counts, double frequency_threshold=0.6, bool allow_gaps=true, bool use_ambiguities=true)
 Calculate a consensus sequence where the character frequency needs to be above a given threshold, for nucleic acid codes ACGT. More...
 
size_t count_chars (SequenceSet const &set, std::string const &chars)
 Count the number of occurrences of the given chars within the sites of the SequenceSet. More...
 
static bool CountPairComparator (CountPair const &lhs, CountPair const &rhs)
 Local helper function to sort a CountPair. More...
 
unsigned char error_probability_to_phred_score (double error_probability)
 
std::vector< unsigned char > error_probability_to_phred_score (std::vector< double > error_probability)
 
signed char error_probability_to_solexa_score (double error_probability)
 
std::vector< signed char > error_probability_to_solexa_score (std::vector< double > error_probability)
 
template<class StringType >
static void fastq_writer_write_sequence_helper_ (StringType const &label, StringType const &sites, StringType const &quality, size_t line_length, bool repeat_label, std::ostream &os)
 Local function template that does the actual work. More...
 
void filter_by_label_list (SequenceSet &set, std::unordered_set< std::string > const &labels, bool invert=false)
 Remove all those Sequences from a SequenceSet whose labels are in the given list. More...
 
void filter_max_sequence_length (SequenceSet &set, size_t max_length)
 Remove all Sequences from the SequenceSet whose length is above the given max_length. More...
 
void filter_min_max_sequence_length (SequenceSet &set, size_t min_length, size_t max_length)
 Remove all Sequences from the SequenceSet whose length is not inbetween the min_length and max_length. More...
 
void filter_min_sequence_length (SequenceSet &set, size_t min_length)
 Remove all Sequences from the SequenceSet whose length is below the given min_length. More...
 
Sequence const * find_sequence (SequenceSet const &set, std::string const &label)
 Return a pointer to a Sequence with a specific label, or nullptr iff not found. More...
 
utils::Bitvector find_sites (Sequence const &seq, std::string const &chars)
 Find sites by character and mark them in a Bitvector. More...
 
utils::Bitvector find_sites (Sequence const &seq, utils::CharLookup< bool > const &chars)
 Find sites by character and mark them in a Bitvector. More...
 
size_t gap_site_count (SiteCounts const &counts)
 
utils::Bitvector gap_sites (Sequence const &seq, std::string const &gap_chars=nucleic_acid_codes_undetermined())
 Return a Bitvector that is true where the Sequence has a gap and false where not. More...
 
utils::Bitvector gap_sites (SequenceSet const &set, std::string const &gap_chars=nucleic_acid_codes_undetermined())
 Return a Bitvector that is true where all Sequences in the SequenceSet have a gap and false where not, that is, where at least on Sequence is not a gap. More...
 
double gapyness (SequenceSet const &set, std::string const &gap_chars)
 Return the "gapyness" of the Sequences, i.e., the proportion of gap chars and other completely undetermined chars to the total length of all sequences. More...
 
QualityEncoding guess_fastq_quality_encoding (std::shared_ptr< utils::BaseInputSource > source, size_t max_lines=0, size_t max_chars=0)
 Guess the quality score encoding for a fastq input, based on counts of how often each char appeared in the quality string (of the input fastq file for example). More...
 
QualityEncoding guess_quality_encoding (std::array< size_t, 128 > const &char_counts)
 Guess the quality score encoding, based on counts of how often each char appeared in the quality string (of a fastq file for example). More...
 
QualityEncoding guess_quality_encoding_from_name (std::string const &name)
 Guess the QualityEncoding type, given its description name. More...
 
std::pair< std::string, size_t > guess_sequence_abundance (Sequence const &sequence)
 Guess the abundance of a Sequence, using it's label. More...
 
std::pair< std::string, size_t > guess_sequence_abundance (std::string const &label)
 Guess the abundance of a Sequence, given it's label. More...
 
bool has_unique_labels (SequenceSet const &set, bool case_sensitive=true)
 Return true iff all labels of the Sequences in the SequenceSet are unique. More...
 
bool has_valid_label (Sequence const &seq)
 Check whether a Sequence has a valid label. More...
 
bool has_valid_labels (SequenceSet const &set)
 Check whether all Sequences in a SequenceSet have valid labels. More...
 
bool is_alignment (SequenceSet const &set)
 Return true iff all Sequences in the SequenceSet have the same length. More...
 
bool is_valid_label (std::string const &label)
 Check whether a given string is a valid label for a Sequence. More...
 
template<typename Tag >
utils::Range< MicrovariantScanner< Tag > > iterate_microvariants (Kmer< Tag > const &kmer)
 
std::string kmer_string_overlapping (Sequence const &sequence, SignatureSpecifications const &settings)
 Return the sequence spitted into overlapping k-mers. More...
 
void kmer_string_overlapping (Sequence const &sequence, SignatureSpecifications const &settings, std::ostream &out)
 Print the sequence spitted into overlapping k-mers. More...
 
static void kmer_string_overlapping_line (Sequence const &sequence, SignatureSpecifications const &settings, std::ostream &out)
 Local helper function that writes an overlapping kmer string to a stream. More...
 
static void kmer_string_single_kmer (Sequence const &sequence, SignatureSpecifications const &settings, size_t start, std::ostream &out)
 Local helper function that writes one kmer string to a stream. More...
 
std::vector< std::string > kmer_strings_non_overlapping (Sequence const &sequence, SignatureSpecifications const &settings)
 Return the sequence spitted into a set of non-overlapping k-mers. More...
 
void kmer_strings_non_overlapping (Sequence const &sequence, SignatureSpecifications const &settings, std::ostream &out)
 Print the sequence spitted into non-overlapping k-mers. More...
 
static void kmer_strings_non_overlapping_line (Sequence const &sequence, SignatureSpecifications const &settings, std::ostream &out, size_t offset)
 Local helper function that does one line of a non overlapping kmer string. More...
 
LabelAttributes label_attributes (Sequence const &sequence)
 Get the attributes list (semicolons-separated) from a Sequence. More...
 
LabelAttributes label_attributes (std::string const &label)
 Get the attributes list (semicolons-separated) from a Sequence, given it's label. More...
 
std::unordered_set< std::string > labels (SequenceSet const &set)
 Return a set of all labels of the SequenceSet. More...
 
size_t longest_sequence_length (SequenceSet const &set)
 Return the length of the longest Sequence in the SequenceSet. More...
 
void merge_duplicate_sequences (SequenceSet &set, MergeDuplicateSequencesCountPolicy count_policy=MergeDuplicateSequencesCountPolicy::kDiscard, std::string const &counter_prefix="_")
 Merge all Sequences in a SequenceSet that have identical sites. More...
 
char normalize_amino_acid_code (char code, bool accept_degenerated=true)
 Normalize an amino acid code. More...
 
void normalize_amino_acid_codes (Sequence &sequence, bool accept_degenerated=true)
 Call normalize_amino_acid_code() for each site of the Sequence. More...
 
void normalize_amino_acid_codes (SequenceSet &sequence_set, bool accept_degenerated=true)
 Call normalize_amino_acid_code() for each site of all Sequences in the SequenceSet. More...
 
std::string normalize_code_alphabet (std::string const &alphabet)
 Normalize an alphabet set of Sequence codes, i.e., make them upper case, sort them, and remove duplicates. More...
 
char normalize_nucleic_acid_code (char code, bool accept_degenerated=true)
 Normalize a nucleic acide code. More...
 
void normalize_nucleic_acid_codes (Sequence &sequence, bool accept_degenerated=true)
 Call normalize_nucleic_acid_code() for each site of the Sequence. More...
 
void normalize_nucleic_acid_codes (SequenceSet &sequence_set, bool accept_degenerated=true)
 Call normalize_nucleic_acid_code() for each site of all Sequences in the SequenceSet. More...
 
std::string nucleic_acid_ambiguities (char code)
 Return the possible ambiguous nucleic acid codes for a given code char. More...
 
char nucleic_acid_ambiguity_code (std::string codes)
 Return the nucleic acid code that represents all given codes. More...
 
bool nucleic_acid_code_containment (char a, char b, bool undetermined_matches_all=true)
 Compare two nucleic acid codes and check if they are equal, taking degenerated/ambiguous characters into account. More...
 
std::string nucleic_acid_codes_all ()
 Return all valid nucleic acid codes. Those are ACGTUWSMKRYBDHVNOX.-?. More...
 
std::string nucleic_acid_codes_all_letters ()
 Return all valid nucleic acid codes. Those are ACGTUWSMKRYBDHVNOX, that is, excluding .-? as compared to nucleic_acid_codes_all(). More...
 
std::string nucleic_acid_codes_degenerated ()
 Return all degenerated nucleic acid codes. Those are WSMKRYBDHV. More...
 
std::string nucleic_acid_codes_plain ()
 Return all plain nucleic acid codes. Those are ACGTU. More...
 
std::string nucleic_acid_codes_undetermined ()
 Return all undetermined nucleic acid codes. Those are NOX.-?. More...
 
std::string nucleic_acid_codes_undetermined_letters ()
 Return all undetermined nucleic acid codes that are letters. Those are NOX, that is, excluding .-? as compared to nucleic_acid_codes_undetermined(). More...
 
std::map< char, utils::Colornucleic_acid_colors ()
 Return a map of Colors for each nucleic acid code. More...
 
std::string nucleic_acid_name (char code)
 Get the name of a nucleic acid given its IUPAC code. More...
 
std::map< char, std::string > nucleic_acid_text_colors ()
 Return a map of text colors for each nucleic acid code. More...
 
char nucleic_acid_transition (char code)
 Return the transition base for the given base. More...
 
size_t number_of_kmers (size_t k, size_t alphabet_size=4)
 Compute the total number of possible k-mers for a given k and alphabet_size. More...
 
template<typename Tag >
bool operator!= (Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
 
bool operator& (SiteEntropyOptions lhs, SiteEntropyOptions rhs)
 And-operator to check whether a SiteEntropyOptions is set. More...
 
template<typename Tag >
bool operator< (Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
 
template<typename Tag >
std::ostream & operator<< (std::ostream &os, Kmer< Tag > const &kmer)
 
std::ostream & operator<< (std::ostream &out, Sequence const &seq)
 Print a Sequence to an ostream in the form "label: sites". More...
 
std::ostream & operator<< (std::ostream &out, SequenceSet const &set)
 Print a SequenceSet to an ostream in the form "label: sites". More...
 
template<typename Tag >
bool operator<= (Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
 
template<typename Tag >
bool operator== (Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
 
template<typename Tag >
bool operator> (Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
 
template<typename Tag >
bool operator>= (Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
 
SiteEntropyOptions operator| (SiteEntropyOptions lhs, SiteEntropyOptions rhs)
 Or-operator to combine two SiteEntropyOptionss. More...
 
SiteEntropyOptionsoperator|= (SiteEntropyOptions &lhs, SiteEntropyOptions rhs)
 Or-assignment-operator to combine two SiteEntropyOptionss. More...
 
std::vector< double > phred_score_to_error_probability (std::vector< unsigned char > phred_score)
 
double phred_score_to_error_probability (unsigned char phred_score)
 
std::vector< signed char > phred_score_to_solexa_score (std::vector< unsigned char > phred_score)
 
signed char phred_score_to_solexa_score (unsigned char phred_score)
 
unsigned char quality_decode_to_phred_score (char quality_code, QualityEncoding encoding=QualityEncoding::kSanger)
 Decode a single quality score char (for example coming from a fastq file) to a phred score. More...
 
std::vector< unsigned char > quality_decode_to_phred_score (std::string const &quality_codes, QualityEncoding encoding=QualityEncoding::kSanger)
 Decode a string of quality scores (for example coming from a fastq file) to phred scores. More...
 
std::string quality_encode_from_phred_score (std::vector< unsigned char > const &phred_scores, bool clamp=true)
 Encode phred scores into quality chars, using the Sanger convention. More...
 
char quality_encode_from_phred_score (unsigned char phred_score, bool clamp=true)
 Encode a phred score into a quality char, using the Sanger convention. More...
 
std::string quality_encoding_name (QualityEncoding encoding, bool with_offset=false)
 Return a readable name for each of the encoding types. More...
 
SequenceDict read_sequence_dict (std::shared_ptr< utils::BaseInputSource > source)
 Read a .dict sequence dictionary file, describing, e.g., reference genome sequence properties. More...
 
SequenceDict read_sequence_fai (std::shared_ptr< utils::BaseInputSource > source)
 Read a .fai sequence index file, describing, e.g., reference genome sequence properties. More...
 
SequenceDict reference_genome_to_dict (ReferenceGenome const &rg)
 Get the sequence dict/index information of a given set of Sequences that are stored in a ReferenceGenome. More...
 
void relabel_with_hash (Sequence &seq, utils::HashingFunctions hash_function)
 Relabel the Sequence using the hash digest of its sites. More...
 
void relabel_with_hash (SequenceSet &set, utils::HashingFunctions hash_function)
 Relabel all Sequences in the SequenceSet using the hash digest of the sites. More...
 
void remove_all_gaps (Sequence &seq, std::string const &gap_chars=nucleic_acid_codes_undetermined())
 Remove all gap characters from the sites of the Sequence. More...
 
void remove_all_gaps (SequenceSet &set, std::string const &gap_chars=nucleic_acid_codes_undetermined())
 Remove all gap characters from the sites of the Sequences in the SequenceSet. More...
 
void remove_characters (Sequence &seq, std::string const &search)
 Remove all of the characters in search from the sites of the Sequence. More...
 
void remove_characters (SequenceSet &set, std::string const &search)
 Remove all of the characters in search from the sites of the Sequences in the SequenceSet. More...
 
void remove_gap_sites (SequenceSet &set, std::string const &gap_chars=nucleic_acid_codes_undetermined())
 Remove all sites that only contain gap characters from the SequenceSet. More...
 
void remove_sites (Sequence &seq, utils::Bitvector sites)
 Remove all sites from a Sequence where the given Bitvector is true, and keep all others. More...
 
void remove_sites (SequenceSet &set, utils::Bitvector sites)
 Remove all sites from all Sequences in a SequenceSet where the given Bitvector is true, and keep all others. More...
 
void replace_characters (Sequence &seq, std::string const &search, char replacement)
 Replace all occurences of the chars in search by the replace char, for all sites in the given Sequence. More...
 
void replace_characters (SequenceSet &set, std::string const &search, char replacement)
 Replace all occurences of the chars in search by the replace char, for all sites in the Sequences in the given SequenceSet. More...
 
void replace_t_with_u (Sequence &seq)
 Replace all occurrences of T by U in the sites of the Sequence. More...
 
void replace_t_with_u (SequenceSet &set)
 Replace all occurrences of T by U in the sites of all Sequences in the SequenceSet. More...
 
void replace_u_with_t (Sequence &seq)
 Replace all occurrences of U by T in the sites of the Sequence. More...
 
void replace_u_with_t (SequenceSet &set)
 Replace all occurrences of U by T in the sites of all Sequences in the SequenceSet. More...
 
template<typename Tag >
Kmer< Tag > reverse_complement (Kmer< Tag > const &kmer)
 
std::string reverse_complement (std::string const &sequence, bool accept_degenerated=true)
 Get the reverse complement of a nucleic acid sequence. More...
 
void sanitize_label (Sequence &seq)
 Sanitize a label by replacing all invalid characters with underscores. More...
 
std::string sanitize_label (std::string const &label)
 Sanitize a label by replacing all invalid characters with underscores. More...
 
void sanitize_labels (SequenceSet &set)
 Sanitize the labels of all Sequences in the SequenceSet by replacing all invalid characters with underscores. More...
 
template<class T >
SequenceDict sequence_iterable_to_dict_ (T const &input)
 
SequenceDict sequence_set_to_dict (SequenceSet const &set)
 Get the sequence dict/index information of a given set of Sequences. More...
 
template<class Combinator >
std::vector< double > signature_complementarity_frequencies_helper (Sequence const &sequence, SignatureSpecifications const &settings, Combinator combinator)
 Local helper function that returns a vector where the frequencies of the non-palindromic kmers are combined using a functor. More...
 
std::vector< size_t > signature_counts (Sequence const &sequence, SignatureSpecifications const &settings)
 Count the occurences of k-mers in the sequence according to the settings. More...
 
std::vector< double > signature_frequencies (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the frequencies of occurences of k-mers in the sequence according to the settings. More...
 
std::vector< double > signature_frequency_ratios_1 (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the ratio 1 signature of a sequence. More...
 
std::vector< double > signature_frequency_ratios_2 (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the ratio 2 signature of a sequence. More...
 
std::vector< double > signature_jensen_shannon (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the Jensen-Shannon (JS) signature of a sequence. More...
 
std::vector< double > signature_maximal_complementarity_frequencies (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the signature of a sequence that uses the maximum frequency of reverse complement k-mers. More...
 
std::vector< double > signature_minimal_complementarity_frequencies (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the signature of a sequence that uses the minimum frequency of reverse complement k-mers. More...
 
std::vector< size_t > signature_ranks (Sequence const &sequence, SignatureSpecifications const &settings)
 Calcuate the rank signature of a sequence according to the settings. More...
 
std::vector< double > signature_reverse_identity_frequencies (Sequence const &sequence, SignatureSpecifications const &settings)
 Calculate the signature of a sequence that uses only the frequencies of k-mers whose reverse complement is the k-mer itself. More...
 
std::vector< size_t > signature_symmetrized_counts (Sequence const &sequence, SignatureSpecifications const &settings)
 Calcuate the symmetrized counts of the sequence according to the settings. More...
 
std::vector< double > signature_symmetrized_frequencies (Sequence const &sequence, SignatureSpecifications const &settings)
 Calcuate the symmetrized counts of the sequence according to the settings. More...
 
template<class T >
std::vector< T > signature_symmetrized_helper (std::vector< T > const &kmers, SignatureSpecifications const &settings)
 Local helper function that adds up the values for reverse complement k-mers. More...
 
std::vector< size_t > signature_symmetrized_ranks (Sequence const &sequence, SignatureSpecifications const &settings)
 Calcuate the symmetrized rank signature of a sequence according to the settings. More...
 
double site_entropy (SiteCounts const &counts, size_t site_index, SiteEntropyOptions options=SiteEntropyOptions::kDefault)
 Calculate the entropy at one site of a SiteCounts object. More...
 
std::map< char, size_t > site_histogram (Sequence const &seq)
 Get a histogram of the occurrences of particular sites, given a Sequence. More...
 
std::map< char, size_t > site_histogram (SequenceSet const &set)
 Get a histogram of the occurrences of particular sites, given a SequenceSet. More...
 
double site_information (SiteCounts const &counts, size_t site_index, bool use_small_sample_correction=false, SiteEntropyOptions options=SiteEntropyOptions::kDefault)
 Calculate the information content at one site of a SiteCounts object. More...
 
double solexa_score_to_error_probability (signed char solexa_score)
 
std::vector< double > solexa_score_to_error_probability (std::vector< signed char > solexa_score)
 
unsigned char solexa_score_to_phred_score (signed char solexa_score)
 
std::vector< unsigned char > solexa_score_to_phred_score (std::vector< signed char > solexa_score)
 
void throw_invalid_quality_code_ (char quality_code, QualityEncoding encoding)
 Local helper function to throw if any invalid fastq quality chars are being used. More...
 
template<typename Tag >
std::string to_string (Kmer< Tag > const &kmer)
 
size_t total_length (SequenceSet const &set)
 Return the total length (sum) of all Sequences in the SequenceSet. More...
 
bool validate_chars (SequenceSet const &set, std::string const &chars)
 Returns true iff all Sequences only consist of the given chars. More...
 
bool verify (SequenceDict const &dict, SequenceSet const &set, bool match_first_word=false)
 Verify that a SequenceDict fits a SequenceSet. More...
 

Enumerations

enum  MergeDuplicateSequencesCountPolicy { kDiscard, kAppendToLabel }
 Provide options for changing how merge_duplicate_sequences() handles the counts of merged Sequences. More...
 
enum  QualityEncoding {
  kSanger, kSolexa, kIllumina13, kIllumina15,
  kIllumina18
}
 List of quality encodings for which we support decoding. More...
 
enum  ReferenceComparisonMode { kStrict, kLeftSuperset, kRightSuperset, kSharedOnly }
 Chose how to deal with sub-/super-sets when comparing references. More...
 
enum  SiteEntropyOptions : unsigned char { kDefault = 0, kIncludeGaps = 1, kWeighted = 2, kNormalized = 4 }
 Option flags to refine the calculation of site_entropy(). More...
 

Typedefs

using CountPair = std::pair< size_t, char >
 Helper struct to store characters sorted by their count. More...
 
using FastaInputStream = FastxInputStream< FastaReader >
 
using FastaOutputStream = FastxOutputStream< FastaWriter >
 
using FastqInputStream = FastxInputStream< FastqReader >
 
using FastqOutputStream = FastxOutputStream< FastqWriter >
 

Variables

static const std::unordered_map< char, std::string > amino_acid_code_to_name
 
static const std::map< char, utils::Coloramino_acid_colors_map
 
static const std::map< char, std::string > amino_acid_text_colors_map
 
static const std::unordered_map< char, std::string > nucleic_acid_ambiguity_char_map
 
static const std::unordered_map< std::string, char > nucleic_acid_ambiguity_string_map
 
static const std::unordered_map< char, std::string > nucleic_acid_code_to_name
 
static const std::map< char, utils::Colornucleic_acid_colors_map
 
static const std::map< char, std::string > nucleic_acid_text_colors_map
 
static const std::array< double, 256 > phred_score_to_error_probability_lookup_
 

Function Documentation

◆ absolute_entropy()

double absolute_entropy ( SiteCounts const &  counts,
SiteEntropyOptions  per_site_options = SiteEntropyOptions::kDefault 
)

Return the sum of all site entropies.

This function simply sums up up the site_entropy() for all sites of the SiteCounts object. The function additionally takes optional flags to refine the site entropy calculation, see SiteEntropyOptions for their explanation.

Definition at line 136 of file sequence/functions/entropy.cpp.

◆ absolute_information()

double absolute_information ( SiteCounts const &  counts,
bool  use_small_sample_correction = false,
SiteEntropyOptions  per_site_options = SiteEntropyOptions::kDefault 
)

Calculate the information content across all sites of a SiteCounts object.

See site_information() for details. This function builds the sum of all per-site information content values.

The function additionally takes optional flags to refine the site entropy calculation, see SiteEntropyOptions for their explanation.

Definition at line 181 of file sequence/functions/entropy.cpp.

◆ amino_acid_codes_all()

std::string amino_acid_codes_all ( )

Return all valid amino acid codes, and .. Those are ACDEFGHIKLMNOPQRSTUVWYBJZX*-?..

See amino_acid_codes_undetermined() for the reason that . is included here. Basically, bioinformatics standards are a mess.

Definition at line 348 of file codes.cpp.

◆ amino_acid_codes_degenerated()

std::string amino_acid_codes_degenerated ( )

Return all degenerated amino acid codes. Those are BJZ.

Definition at line 338 of file codes.cpp.

◆ amino_acid_codes_plain()

std::string amino_acid_codes_plain ( )

Return all plain amino acid codes. Those are ACDEFGHIKLMNOPQRSTUVWY.

Definition at line 333 of file codes.cpp.

◆ amino_acid_codes_undetermined()

std::string amino_acid_codes_undetermined ( )

Return all undetermined amino acid codes, and .. Those are X*-?..

The typical undetermined codes for amino acids are X*-?, but some programs (e.g., hmmalign) also use the ., borrowed from nucleotide codes, as an undetermined character. Hence, we also support this here.

Definition at line 343 of file codes.cpp.

◆ amino_acid_colors()

std::map< char, utils::Color > amino_acid_colors ( )

Return a map of Colors for each amino acid code.

This function gives a Color for each amino acid code.

Definition at line 677 of file codes.cpp.

◆ amino_acid_name()

std::string amino_acid_name ( char  code)

Get the name of a amino acid given its IUPAC code.

The codes are translated as follows:

A Alanine
B Aspartic acid or Asparagine
C Cysteine
D Aspartic acid
E Glutamic acid
F Phenylalanine
G Glycine
H Histidine
I Isoleucine
J Leucine or Isoleucine
K Lysine
L Leucine
M Methionine
N Asparagine
O Pyrrolysine
P Proline
Q Glutamine
R Arginine
S Serine
T Threonine
U Selenocysteine
V Valine
W Tryptophan
Y Tyrosine
Z Glutamic acid or Glutamine
X any
* translation stop
- gap
? gap

The code char is treated case-insensitive. If the given code char is not valid, an std::out_of_range exception is thrown.

Definition at line 695 of file codes.cpp.

◆ amino_acid_text_colors()

std::map< char, std::string > amino_acid_text_colors ( )

Return a map of text colors for each amino acid code.

This function gives a color name usable for utils::Style for each amino acid code. The return value of this function can for example be used in sequence::print_color() function.

Definition at line 667 of file codes.cpp.

◆ average_entropy()

double average_entropy ( SiteCounts const &  counts,
bool  only_determined_sites = false,
SiteEntropyOptions  per_site_options = SiteEntropyOptions::kDefault 
)

Return the average sum of all site entropies.

This function sums up up the site_entropy() for all sites of the SiteCounts object and returns the average result per site.

If only_determined_sites is false (default), the average is calculated using the total number of sites, that is, it simply calculates the average entropy per site.

If only_determined_sites is true, the average is calculated using the number of determined sites only; that is, sites that only contain zeroes in all counts are skipped. Those sites do not contribute entropy anyway. Thus, it calcuates the average entropy per determiend site.

The function additionally takes optional flags to refine the site entropy calculation, see SiteEntropyOptions for their explanation.

Definition at line 147 of file sequence/functions/entropy.cpp.

◆ average_information()

double average_information ( SiteCounts const &  counts,
bool  only_determined_sites = false,
bool  use_small_sample_correction = false,
SiteEntropyOptions  per_site_options = SiteEntropyOptions::kDefault 
)

Calculate the information content across all sites of a SiteCounts object.

See site_information() for details. This function builds the sum of all per-site information content values.

If only_determined_sites is false (default), the average is calculated using the total number of sites, that is, it simply calculates the average entropy per site.

If only_determined_sites is true, the average is calculated using the number of determined sites only; that is, sites that only contain zeroes in all counts are skipped. Those sites do not contribute entropy anyway. Thus, it calcuates the average entropy per determiend site.

The function additionally takes optional flags to refine the site entropy calculation, see SiteEntropyOptions for their explanation.

Definition at line 193 of file sequence/functions/entropy.cpp.

◆ base_frequencies() [1/2]

std::map< char, double > base_frequencies ( Sequence const &  seq,
std::string const &  plain_chars 
)

Get the base frequencies of the sites in a Sequence given the base chars.

This returns the relative proportions of the given plain_chars to each other. Typically, the given chars come from either nucleic_acid_codes_plain() or amino_acid_codes_plain(), depending on the dataset.

It is necessary to select those chars on a per-dataset basis, as it is up to the user to define the meaning of those chars.

Definition at line 135 of file sequence/functions/stats.cpp.

◆ base_frequencies() [2/2]

std::map< char, double > base_frequencies ( SequenceSet const &  set,
std::string const &  plain_chars 
)

Get the base frequencies of the sites in a SequenceSet given the base chars.

See the Sequence implementation of this function for details.

Definition at line 141 of file sequence/functions/stats.cpp.

◆ base_frequencies_accumulator()

static std::map<char, double> genesis::sequence::base_frequencies_accumulator ( std::map< char, size_t > const &  sitehistogram,
std::string const &  plain_chars 
)
static

Local helper function that turns a site histogram into base frequencies.

Definition at line 111 of file sequence/functions/stats.cpp.

◆ canonical_representation()

Kmer<Tag> genesis::sequence::canonical_representation ( Kmer< Tag > const &  kmer)

Definition at line 172 of file function.hpp.

◆ compatible_quality_encodings()

bool compatible_quality_encodings ( QualityEncoding  lhs,
QualityEncoding  rhs 
)

Return whether two quality encodings are compatible with each other.

This is the case when both have the same ASCII offset, except for Solexa, which is not compatible with any of the others. This function is meant as a check when for example the user specified one encoding, but functions such as guess_quality_encoding() and guess_fastq_quality_encoding() return a different one - the two encodings might be indistinguishable for that function, because the have the same offset. That also means that one can be used to decode the other - they are compatible.

Definition at line 401 of file quality.cpp.

◆ compatible_references()

bool compatible_references ( SequenceDict const &  lhs,
SequenceDict const &  rhs,
ReferenceComparisonMode  mode = ReferenceComparisonMode::kStrict 
)

Verify that a SequenceDict fits a SequenceSet.

The function return true iff the lhs has sequences of the same name and length, in the same order, as the rhs, and false otherwise. This takes a mode into consideration, to select how sequences are handled that are not in both sets. See ReferenceComparisonMode for details.

Note that this function is written for two SequenceDict instances for simplicity. It is however meant to be use to compare for instance ReferenceGenome or SequenceSet instances as well. As the comparison is solely based on sequence names and lengths though, using this as a shared data structure to which all of these types can be converted to makes it easier to use. See sequence_set_to_dict() or reference_genome_to_dict() for these conversions for instance.

Definition at line 207 of file dict.cpp.

◆ consensus_sequence_cavener() [1/2]

std::string consensus_sequence_cavener ( SequenceSet const &  sequences,
bool  allow_gaps = true 
)

Calculate a consensus sequence using the method by Cavener for nucleic acid codes ACGT and their ambiguities.

See consensus_sequence_cavener(...) for details. This is merely a wrapper function that takes a SequenceSet instead of a SiteCounts object.

Definition at line 528 of file consensus.cpp.

◆ consensus_sequence_cavener() [2/2]

std::string consensus_sequence_cavener ( SiteCounts const &  counts,
bool  allow_gaps = true 
)

Calculate a consensus sequence using the method by Cavener for nucleic acid codes ACGT and their ambiguities.

For every site, the nucleotides are sorted by frequency. Then, the following checks are performed:

  1. A single nucleotide is used if its frequency is at least 50% and more than twice as high as the second most frequent nucleotide.
  2. If two nucleotides occur in at leat 75% of the sequences, and rule 1 does not apply, their ambiguity code is used (double-degenerate code).
  3. If none of the above applies, but one of the nucleotides has a count of zero, the (triple-degenerate) code of the other three nucleotides is used.
  4. In all other cases, the code 'N' is used.

The method is meant for finding a consensus sequence in sets of sequences without gaps. This implementation however also allows gaps, which are just treated as a normal character if allow_gaps is true (default). In this case, if any of rules 1-3 applies to a gap character, the result is simply reduced to a gap. That is, whenever a gap is used to form a degenerate code according to the rules, the whole code is reduced to a gap.

If however allow_gaps is false, gap characters are completely ignored from the counting.

The method was described in:

D. R. Cavener, "Comparison of the consensus sequence flanking translational start sites in Drosophila and vertebrates", Nucleic Acids Res., vol. 15, no. 4, 1987.

The method is also used by the Transfac Project:

V. Matys et al., "TRANSFAC: Transcriptional regulation, from patterns to profiles", Nucleic Acids Res., vol. 31, no. 1, pp. 374–378, 2003.

Definition at line 466 of file consensus.cpp.

◆ consensus_sequence_template()

std::string genesis::sequence::consensus_sequence_template ( SiteCounts const &  counts,
bool const  allow_gaps,
CharSelector const &  char_selector 
)

Local helper function template that handles the common code for the nucleic acid consensus sequence functions.

Definition at line 73 of file consensus.cpp.

◆ consensus_sequence_with_ambiguities() [1/2]

std::string consensus_sequence_with_ambiguities ( SequenceSet const &  sequences,
double  similarity_factor = 0.9,
bool  allow_gaps = true 
)

Calculate a consensus sequence by using the most frequent characters at each site, for nucleic acid codes ACGT and their ambiguities.

See consensus_sequence_with_ambiguities(...) for details. This is merely a wrapper function that takes a SequenceSet instead of a SiteCounts object.

Definition at line 324 of file consensus.cpp.

◆ consensus_sequence_with_ambiguities() [2/2]

std::string consensus_sequence_with_ambiguities ( SiteCounts const &  counts,
double  similarity_factor = 0.9,
bool  allow_gaps = true 
)

Calculate a consensus sequence by using the most frequent characters at each site, for nucleic acid codes ACGT and their ambiguities.

The function calculates a consensus sequence for nucleic acid codes (ACGT), using their ambiguity codes (e.g., W for "weak" == AT) if the counts (i.e., character frequencies) are similar at a site. It uses similarity_factor to decide which counts are close enough to each other in order to be considered ambiguous.

For example, with similarity_factor == 1.0, only exact matches are used, that is, if two counts are exactly the same. Let count('A') == 42 and count('T') == 42, and both other counts (C and G) be 0, this results in the code W at that site. If however count('T') == 41, only A is used for the site. Thus, with similarity_factor == 1.0, this function behaves very similar to consensus_sequence_with_majorities(), except in cases were two counts are exaclty the same.

On the other hand, with similarity_factor == 0.0, all codes that are present at a site are considered to be ambiguous. That is, if a site contains counts > 0 for A, G and T, the resulting site gets the code D ("not C").

For intermediate values, e.g., the default 0.9, the value is used as a threshold to decide the ambiguities. For example, let count('A') == 42 and count('T') == 38, and both other counts be 0. Then, the allowed deviation from the maximum 42 is 0.9 * 42 = 37.8. Thus, as the count for T is above this value, those two codes are considered ambiguous, resulting in a W at that site.

The optional parameter allow_gaps (default is true) behaves similar to its counterpart in consensus_sequence_with_majorities(). If set to true, the count of the gap character is also considered. As the ambiguity for a gap combined with any other character is still a gap, sites where gap is the most frequent charater or is within the deviation get a gap as result.

If allow_gaps is set to false instead, gaps are not considered. That means, the ambiguities are calculated as if there were no gaps. So even if a site contains mostly gaps, but only a few other characters, those will be used. Solely all-gap sites result in a gap at that site.

Remark: As this function expects nucleic acid codes, the gap character is fixed to '-' here. The ambiguity codes are converted using nucleic_acid_ambiguity_code(). See there for more information on the used codes.

If the provided SiteCounts object does not use nucleic acid codes, or if the similarity_factor is not within the range [ 0.0, 1.0 ], an exception is thrown.

Definition at line 253 of file consensus.cpp.

◆ consensus_sequence_with_majorities() [1/3]

std::string consensus_sequence_with_majorities ( SequenceSet const &  sequences,
bool  allow_gaps = true 
)

Calculate the majority rule consensus sequence by using the most frequent character at each site for nucleic acid codes ACGT.

See consensus_sequence_with_majorities(...) for details.

This function is merely a wrapper that instead of a SiteCounts objects, takes a SequenceSet object consisting of Sequences with nucleic acid codes ACGT and uses '-' for gaps.

Definition at line 237 of file consensus.cpp.

◆ consensus_sequence_with_majorities() [2/3]

std::string consensus_sequence_with_majorities ( SequenceSet const &  sequences,
std::string const &  characters,
bool  allow_gaps = true,
char  gap_char = '-' 
)

Calculate the majority rule consensus sequence by using the most frequent character at each site.

See consensus_sequence_with_majorities(...) for details.

This function is merely a wrapper that instead of a SiteCounts objects, takes a SequenceSet object and the set of characters to be used for counting character frequencies in the Sequences. That means, only the provided characters are counted and used for the consensus sequence. This is useful in order to get rid of errors and noise in the Sequences. For example, if you want to build a consensus sequence for a set of sequences with amino acid codes, use amino_acid_codes_plain() for set characters parameter.

Definition at line 212 of file consensus.cpp.

◆ consensus_sequence_with_majorities() [3/3]

std::string consensus_sequence_with_majorities ( SiteCounts const &  counts,
bool  allow_gaps = true,
char  gap_char = '-' 
)

Calculate the majority rule consensus sequence by using the most frequent character at each site.

The function creates a consensus sequence by using the character at each position that has the highest count (or frequency). It does not assume any character codes. Thus, it works for all kinds of sequence codes, e.g., nucleic acid or amino acid codes.

The optional parameter allow_gaps (default is true) determines whether gaps in the consensus sequence are allowed. By default, if a site consists mostly of gaps, the consensus sequence also contains a gap at that site. If however this option is set to false, the consensus sequence will contain the most frequent non-gap character, even if there are more gaps at this site than the character itself. In other words, if the parameter is set to false, gaps are treated as missing characters instead of another type of character for computing the consensus. The only exception are gaps-only sites; in this case, the resulting sites contain a gap characters even if the parameter is set to false.

The optional parameter gap_char (default value '-') is used for sites where no counts are available (i.e., are all zero), or, if allow_gaps is set to true, for sites that contain mostly gaps.

Furthermore, if two or more characters have the same frequency, the first one is used. That is, the one that appears first in SiteCounts::characters().

For an alternative version of this function that takes those ambiguities into account, see consensus_sequence_with_ambiguities(). Also, for a version of this function that takes a threshold for the character frequencies into account, see consensus_sequence_with_threshold(). However, both of them currently only work for nucleic acid codes (ACGT).

Definition at line 162 of file consensus.cpp.

◆ consensus_sequence_with_threshold() [1/2]

std::string consensus_sequence_with_threshold ( SequenceSet const &  sequences,
double  frequency_threshold = 0.6,
bool  allow_gaps = true,
bool  use_ambiguities = true 
)

Calculate a consensus sequence where the character frequency needs to be above a given threshold, for nucleic acid codes ACGT.

See consensus_sequence_with_ambiguities(...) for details. This is merely a wrapper function that takes a SequenceSet instead of a SiteCounts object.

Definition at line 432 of file consensus.cpp.

◆ consensus_sequence_with_threshold() [2/2]

std::string consensus_sequence_with_threshold ( SiteCounts const &  counts,
double  frequency_threshold = 0.6,
bool  allow_gaps = true,
bool  use_ambiguities = true 
)

Calculate a consensus sequence where the character frequency needs to be above a given threshold, for nucleic acid codes ACGT.

The function calculates a consensus sequence for nucleic acid codes (ACGT). It uses the frequency of characters at each site to determine the consensus. The frequency is relative to the total number of counts at that site, thus, it is a value in the range [ 0.0, 1.0 ].

If the frequency of a character at a site is above the given frequency_threshold, it is used for the consensus. If not, the resulting character depends on use_ambiguities. If use_ambiguities is set to true (default), the sorted frequencies of the characters are added until the threshold is reached, and the ambiguity code for those characters is used. For example, let frequency_threshold == 0.9, count('A') == 42 and count('T') == 42, and both other counts (C and G) be 0. Then, neither A nor T have counts above the threshold, but combined they do, so the result is code W == AT at that site. If however use_ambiguities is false, the mask character X is used for sites that are below the threshold.

Furthermore, if allow_gaps is set to true (default), gaps are counted when determining the threshold and checking whether the frequency is above it. That is, gaps are then treated as just another character at the site. For sites where the gap frequency is needed to reach the threshold, a gap is used as consensus. If allow_gaps is false, however, gaps are not counted for determining the frequency of the other characters. This is similar to the counterpart in consensus_sequence_with_majorities(). Solely sites that are gaps-only result in a gap char for the consensus then.

For frequency_threshold < 0.5, it may happen that more than one character has a frequency above the threshold. In such cases, the most frequent character is used (or, if they have exaclty the same counts, they are used in the order ACGT). This is in line with the behaviour of consensus_sequence_with_majorities(). Usually, however, the threshold is above 0.5 anyway, as this gives more meaningful results. If you want to use ambiguity characters for low frequency characters, you can use consensus_sequence_with_ambiguities() instead.

An extreme case is a frequency_threshold of 1.0. In this case, for sites which only have one character, this one is directly used in the consensus. Sites with multiple different characters result in the ambiguity code of all occuring characters at that site. Thus, the function then behaves similar to consensus_sequence_with_ambiguities() with a similarity_factor of 0.0.

The other extrem case is a frequency_threshold of 0.0. In this case, the function simply uses the most frequent character per site, as it always fulfills the threshold. As said above, if then more than one character has exactly the same frequency, they are used in the order ACGT, thus the function then behaves similar to consensus_sequence_with_majorities().

Remark: As this function expects nucleic acid codes, the gap character is fixed to '-' here, and the mask character to 'X'. The ambiguity codes are converted using nucleic_acid_ambiguity_code(). See there for more information on the used codes.

If the provided SiteCounts object does not use nucleic acid codes, or if the frequency_threshold is not within the range [ 0.0, 1.0 ], an exception is thrown.

Definition at line 352 of file consensus.cpp.

◆ count_chars()

size_t count_chars ( SequenceSet const &  set,
std::string const &  chars 
)

Count the number of occurrences of the given chars within the sites of the SequenceSet.

This function can be used to count e.g. gaps or ambiguous characters in sequences. For presettings of usable chars, see the functions nucleic_acid_codes_... and amino_acid_codes_.... The chars are treated case-insensitive.

If chars contains invalid (non-standard ASCII) characters, an std::invalid_argument exception is thrown.

Definition at line 151 of file sequence/functions/stats.cpp.

◆ CountPairComparator()

static bool genesis::sequence::CountPairComparator ( CountPair const &  lhs,
CountPair const &  rhs 
)
static

Local helper function to sort a CountPair.

The comparatorfirst sorts by count, and for equal counts, sorts alphanumercially by the char. Thus, gaps are always first (their ascii code is smaller than all letters).

Definition at line 63 of file consensus.cpp.

◆ error_probability_to_phred_score() [1/2]

unsigned char error_probability_to_phred_score ( double  error_probability)

Definition at line 565 of file quality.cpp.

◆ error_probability_to_phred_score() [2/2]

std::vector< unsigned char > error_probability_to_phred_score ( std::vector< double >  error_probability)

Definition at line 644 of file quality.cpp.

◆ error_probability_to_solexa_score() [1/2]

signed char error_probability_to_solexa_score ( double  error_probability)

Definition at line 587 of file quality.cpp.

◆ error_probability_to_solexa_score() [2/2]

std::vector< signed char > error_probability_to_solexa_score ( std::vector< double >  error_probability)

Definition at line 662 of file quality.cpp.

◆ fastq_writer_write_sequence_helper_()

static void genesis::sequence::fastq_writer_write_sequence_helper_ ( StringType const &  label,
StringType const &  sites,
StringType const &  quality,
size_t  line_length,
bool  repeat_label,
std::ostream &  os 
)
static

Local function template that does the actual work.

We introduce a template param for std::string and std::string_view, if C++17 is used.

Definition at line 57 of file fastq_writer.cpp.

◆ filter_by_label_list()

void filter_by_label_list ( SequenceSet set,
std::unordered_set< std::string > const &  labels,
bool  invert = false 
)

Remove all those Sequences from a SequenceSet whose labels are in the given list.

If invert is set to true, it does the same inverted: it removes all Sequences except those whose label is in the list.

Definition at line 281 of file labels.cpp.

◆ filter_max_sequence_length()

void filter_max_sequence_length ( SequenceSet set,
size_t  max_length 
)

Remove all Sequences from the SequenceSet whose length is above the given max_length.

See also filter_min_sequence_length() and filter_min_max_sequence_length().

Definition at line 416 of file sequence/functions/functions.cpp.

◆ filter_min_max_sequence_length()

void filter_min_max_sequence_length ( SequenceSet set,
size_t  min_length,
size_t  max_length 
)

Remove all Sequences from the SequenceSet whose length is not inbetween the min_length and max_length.

This function has the same effect as calling filter_min_sequence_length() and filter_max_sequence_length(), but does it in one iteration over the SequenceSet.

Definition at line 428 of file sequence/functions/functions.cpp.

◆ filter_min_sequence_length()

void filter_min_sequence_length ( SequenceSet set,
size_t  min_length 
)

Remove all Sequences from the SequenceSet whose length is below the given min_length.

See also filter_max_sequence_length() and filter_min_max_sequence_length().

Definition at line 404 of file sequence/functions/functions.cpp.

◆ find_sequence()

Sequence const * find_sequence ( SequenceSet const &  set,
std::string const &  label 
)

Return a pointer to a Sequence with a specific label, or nullptr iff not found.

Definition at line 54 of file labels.cpp.

◆ find_sites() [1/2]

utils::Bitvector find_sites ( Sequence const &  seq,
std::string const &  chars 
)

Find sites by character and mark them in a Bitvector.

The function iterates the sites of a Sequence and checkes whether the char at a site is in the provided set of chars (case insensitive). If so, the resulting Bitvector is set to true at that position.

Definition at line 60 of file sequence/functions/functions.cpp.

◆ find_sites() [2/2]

utils::Bitvector find_sites ( Sequence const &  seq,
utils::CharLookup< bool > const &  chars 
)

Find sites by character and mark them in a Bitvector.

The function iterates the sites of a Sequence and checkes whether the char at a site is set to true in the provided lookup of chars. If so, the resulting Bitvector is set to true at that position.

Definition at line 71 of file sequence/functions/functions.cpp.

◆ gap_site_count()

size_t gap_site_count ( SiteCounts const &  counts)

Definition at line 190 of file sequence/functions/stats.cpp.

◆ gap_sites() [1/2]

utils::Bitvector gap_sites ( Sequence const &  seq,
std::string const &  gap_chars = nucleic_acid_codes_undetermined() 
)

Return a Bitvector that is true where the Sequence has a gap and false where not.

The gap_chars are used case-insensitively to determine what is considerted to be a gap. By default, nucleic_acid_codes_undetermined() are used, but any other set of characters is allowed.

Definition at line 82 of file sequence/functions/functions.cpp.

◆ gap_sites() [2/2]

utils::Bitvector gap_sites ( SequenceSet const &  set,
std::string const &  gap_chars = nucleic_acid_codes_undetermined() 
)

Return a Bitvector that is true where all Sequences in the SequenceSet have a gap and false where not, that is, where at least on Sequence is not a gap.

The gap_chars are used case-insensitively to determine what is considerted to be a gap. By default, nucleic_acid_codes_undetermined() are used, but any other set of characters is allowed.

Definition at line 87 of file sequence/functions/functions.cpp.

◆ gapyness()

double gapyness ( SequenceSet const &  set,
std::string const &  gap_chars 
)

Return the "gapyness" of the Sequences, i.e., the proportion of gap chars and other completely undetermined chars to the total length of all sequences.

This function returns a value in the interval 0.0 (no gaps and undetermined chars at all) and 1.0 (all chars are undetermined). See nucleic_acid_codes_undetermined() and amino_acid_codes_undetermined() for presettings of gap character that can be used here depending on the data set type. The chars are treated case-insensitive. In the special case that there are no sequences or sites, 0.0 is returned.

Definition at line 177 of file sequence/functions/stats.cpp.

◆ guess_fastq_quality_encoding()

QualityEncoding guess_fastq_quality_encoding ( std::shared_ptr< utils::BaseInputSource source,
size_t  max_lines = 0,
size_t  max_chars = 0 
)

Guess the quality score encoding for a fastq input, based on counts of how often each char appeared in the quality string (of the input fastq file for example).

The function reads and parses the input source as a fastq file, counts all quality score chars as they appear in there, and then guesses the encoding that was used. If max_lines is set to a value greater than 0, only that many lines are read. If max_chars is set to a value greater than 0, only that many quality score charaters are read.

See also
guess_quality_encoding()

Definition at line 498 of file quality.cpp.

◆ guess_quality_encoding()

QualityEncoding guess_quality_encoding ( std::array< size_t, 128 > const &  char_counts)

Guess the quality score encoding, based on counts of how often each char appeared in the quality string (of a fastq file for example).

The char_counts needs to be filled with counts of how often each quality code char appeared in the (fastq) quality strings. If any values outside of the printable character range (ASCII 33 to 127) are non-zero in the char_counts, the function throws, as these are invaliv qualiy encodings. Otherwise, it guesses which QualityEncoding was used for the data, based on which chars appear in it.

See also
guess_fastq_quality_encoding()

Definition at line 433 of file quality.cpp.

◆ guess_quality_encoding_from_name()

QualityEncoding guess_quality_encoding_from_name ( std::string const &  name)

Guess the QualityEncoding type, given its description name.

This is the reverse of quality_encoding_name(), but additionally allows the given name to be fuzzy. The name is stripped off all non-alphanumerical characters and made lower-case. The remainder (e.g., illumina13) then is matched to the enum QualityEncoding, with the additional option that just illumina (without any version number) is matched to QualityEncoding::kIllumina18.

Definition at line 197 of file quality.cpp.

◆ guess_sequence_abundance() [1/2]

std::pair< std::string, size_t > guess_sequence_abundance ( Sequence const &  sequence)

Guess the abundance of a Sequence, using it's label.

The function splits the label of a Sequence into two parts: the descriptive name of the sequence, and an abundance value (weight or multiplicity of the sequence), which are returned as a std::pair.

The function accepts two patterns of reporting abundances via the label() of a Sequence:

  • Appended via underscore: name_123. In this case, the number has to be the last in the label, that is, no other text may follow.
  • Using the format ;size=123;. The semicoli are mandatory, except the second one if nothing else follows in the label. See label_attributes() for details.

If neither of them is found, a default abundance of 1 is returned.

Definition at line 72 of file labels.cpp.

◆ guess_sequence_abundance() [2/2]

std::pair< std::string, size_t > guess_sequence_abundance ( std::string const &  label)

Guess the abundance of a Sequence, given it's label.

This is the same as guess_sequence_abundance( Sequence const& ), but takes the label as a string, instead of the Sequence object. See there for details.

Definition at line 77 of file labels.cpp.

◆ has_unique_labels()

bool has_unique_labels ( SequenceSet const &  set,
bool  case_sensitive = true 
)

Return true iff all labels of the Sequences in the SequenceSet are unique.

The optional parameter case_sensitive controls how labels are compared. Default is true, that is, Sequences are compared case-sensitively.

Definition at line 185 of file labels.cpp.

◆ has_valid_label()

bool has_valid_label ( Sequence const &  seq)

Check whether a Sequence has a valid label.

This might be important for printing the Sequence to a file that needs to be read by other applications. See is_valid_label() for details on what is considered a valid label. See sanitize_label() for a function that replaces all invalid characters of the label by underscores.

Definition at line 234 of file labels.cpp.

◆ has_valid_labels()

bool has_valid_labels ( SequenceSet const &  set)

Check whether all Sequences in a SequenceSet have valid labels.

This might be important for printing the Sequences to a file that needs to be read by other applications. See is_valid_label() for details on what is considered a valid label. See sanitize_labels() for a function that replaces all invalid characters of the labels by underscores.

Definition at line 239 of file labels.cpp.

◆ is_alignment()

bool is_alignment ( SequenceSet const &  set)

Return true iff all Sequences in the SequenceSet have the same length.

Definition at line 164 of file sequence/functions/functions.cpp.

◆ is_valid_label()

bool is_valid_label ( std::string const &  label)

Check whether a given string is a valid label for a Sequence.

While we can work with any form of label (as long as it is a string), most file formats and consequently most programs that read them restrict the set of valid characters for labels of sequences. We thus provide this function, which uses the most common interpretation of valid labels.

A label is valid if its characters have a graphical representation (i.e., isgraph() is true) and if non of these characters occurs:

:,();[]'

Thus, all whitespaces, control characters, and the listed special characters are invalid. See sanitize_label() for a function that replaces all invalid characters of the label by underscores.

Definition at line 223 of file labels.cpp.

◆ iterate_microvariants()

utils::Range<MicrovariantScanner<Tag> > genesis::sequence::iterate_microvariants ( Kmer< Tag > const &  kmer)
inline

Definition at line 212 of file microvariant_scanner.hpp.

◆ kmer_string_overlapping() [1/2]

std::string kmer_string_overlapping ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Return the sequence spitted into overlapping k-mers.

The function takes the sequence and splits it into k-mers according to settings, using a space char as delimiter between k-mers, with overlap.

For example, the sequence ACGTACGT with k == 3 becomes

ACG CGT GTA TAC ACG CGT

The naming of the function follows

D. Kimothi, A. Soni, P. Biyani, and J. M. Hogan, “Distributed Representations for Biological Sequence Analysis,” CoRR, vol. abs/1608.0, 2016.

See there for details.

Definition at line 426 of file signatures.cpp.

◆ kmer_string_overlapping() [2/2]

void kmer_string_overlapping ( Sequence const &  sequence,
SignatureSpecifications const &  settings,
std::ostream &  out 
)

Print the sequence spitted into overlapping k-mers.

This is identical to kmer_string_overlapping( Sequence const&, SignatureSpecifications const& ), but prints directly to a stream, which is better for processing large files. After the k-mer sequence, a new line character is printed.

Definition at line 435 of file signatures.cpp.

◆ kmer_string_overlapping_line()

static void genesis::sequence::kmer_string_overlapping_line ( Sequence const &  sequence,
SignatureSpecifications const &  settings,
std::ostream &  out 
)
static

Local helper function that writes an overlapping kmer string to a stream.

Definition at line 387 of file signatures.cpp.

◆ kmer_string_single_kmer()

static void genesis::sequence::kmer_string_single_kmer ( Sequence const &  sequence,
SignatureSpecifications const &  settings,
size_t  start,
std::ostream &  out 
)
static

Local helper function that writes one kmer string to a stream.

Definition at line 357 of file signatures.cpp.

◆ kmer_strings_non_overlapping() [1/2]

std::vector< std::string > kmer_strings_non_overlapping ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Return the sequence spitted into a set of non-overlapping k-mers.

The function takes the sequence and splits it into k-mers according to settings, using a space char as delimiter between k-mers, without overlap.

For example, the sequence ACGTACGTACGT with k == 3 becomes

ACG TAC GTA CGT
CGT ACG TAC
GTA CGT ACG

The naming of the function follows

D. Kimothi, A. Soni, P. Biyani, and J. M. Hogan, “Distributed Representations for Biological Sequence Analysis,” CoRR, vol. abs/1608.0, 2016.

See there for details.

Definition at line 448 of file signatures.cpp.

◆ kmer_strings_non_overlapping() [2/2]

void kmer_strings_non_overlapping ( Sequence const &  sequence,
SignatureSpecifications const &  settings,
std::ostream &  out 
)

Print the sequence spitted into non-overlapping k-mers.

This is identical to kmer_strings_non_overlapping( Sequence const&, SignatureSpecifications const& ), but prints directly to a stream, which is better for processing large files. After each k-mer sequence, a new line character is printed.

Definition at line 468 of file signatures.cpp.

◆ kmer_strings_non_overlapping_line()

static void genesis::sequence::kmer_strings_non_overlapping_line ( Sequence const &  sequence,
SignatureSpecifications const &  settings,
std::ostream &  out,
size_t  offset 
)
static

Local helper function that does one line of a non overlapping kmer string.

Definition at line 411 of file signatures.cpp.

◆ label_attributes() [1/2]

LabelAttributes label_attributes ( Sequence const &  sequence)

Get the attributes list (semicolons-separated) from a Sequence.

It is common to store additional information in sequence headers, e.g., in the fasta format, using a semicolon-separated list of attributes like this:

>some_name;size=123;thing=foo;

This function disects this kind of information and returns it. The returned struct contains the label (the part before the first semicolon), as well as a map for the attributes. As this is not a multimap, later attributes with the same key overwrite earlier ones.

If the sequence label does not contain any information that is separated via a semicolon, the attributes list is returned empty. However, if semicola are found in the label, the correct format is expected (with the syntax ;key=value;) for each attribute. Otherwise, an exception is thrown. The last semicolon is optional; that is, the label can simply end after the last value.

Definition at line 155 of file labels.cpp.

◆ label_attributes() [2/2]

LabelAttributes label_attributes ( std::string const &  label)

Get the attributes list (semicolons-separated) from a Sequence, given it's label.

This is the same as label_attributes( Sequence const& ), but takes the label as a string, instead of the Sequence object. See there for details.

Definition at line 160 of file labels.cpp.

◆ labels()

std::unordered_set< std::string > labels ( SequenceSet const &  set)

Return a set of all labels of the SequenceSet.

Definition at line 64 of file labels.cpp.

◆ longest_sequence_length()

size_t longest_sequence_length ( SequenceSet const &  set)

Return the length of the longest Sequence in the SequenceSet.

Definition at line 146 of file sequence/functions/functions.cpp.

◆ merge_duplicate_sequences()

void merge_duplicate_sequences ( SequenceSet set,
MergeDuplicateSequencesCountPolicy  count_policy = MergeDuplicateSequencesCountPolicy::kDiscard,
std::string const &  counter_prefix = "_" 
)

Merge all Sequences in a SequenceSet that have identical sites.

The merging is done by removing all but the first Sequence with identical sites. That means, the resulting "representative" of a set of merged Sequences has the label of the original Sequence that was first in the SequenceSet.

Using the MergeDuplicateSequencesCountPolicy, it is possible to store the counts of the Sequences (i.e., how often they appeard before merging) within the label of the Sequence, separated by counter_prefix. With the default settings, the count is appended to the label, separated by an underscore.

Definition at line 304 of file sequence/functions/functions.cpp.

◆ normalize_amino_acid_code()

char normalize_amino_acid_code ( char  code,
bool  accept_degenerated = true 
)

Normalize an amino acid code.

That is, make it upper case and replace all undetermined chars by -. See amino_acid_codes_undetermined() for a list of the latter.

If accept_degenerated is set to true (default), degenerated chars are just put to upper case, but otherwise left as they are. If set to false, an exception is thrown if a degenerated char is encountered. See amino_acid_codes_degenerated() for their list.

Lastly, an exception is also thrown for non amino acid codes, that is all chars that are not part of amino_acid_codes_all().

Definition at line 427 of file codes.cpp.

◆ normalize_amino_acid_codes() [1/2]

void normalize_amino_acid_codes ( Sequence sequence,
bool  accept_degenerated = true 
)

Call normalize_amino_acid_code() for each site of the Sequence.

See there for details.

Definition at line 386 of file sequence/functions/functions.cpp.

◆ normalize_amino_acid_codes() [2/2]

void normalize_amino_acid_codes ( SequenceSet sequence_set,
bool  accept_degenerated = true 
)

Call normalize_amino_acid_code() for each site of all Sequences in the SequenceSet.

See there for details.

Definition at line 393 of file sequence/functions/functions.cpp.

◆ normalize_code_alphabet()

std::string normalize_code_alphabet ( std::string const &  alphabet)

Normalize an alphabet set of Sequence codes, i.e., make them upper case, sort them, and remove duplicates.

For example, when given a set of nucleic acid codes like aGtc, the function returns ACGT. This is useful to get consistent codes in functions that accept a user defined code alphabet.

Definition at line 359 of file codes.cpp.

◆ normalize_nucleic_acid_code()

char normalize_nucleic_acid_code ( char  code,
bool  accept_degenerated = true 
)

Normalize a nucleic acide code.

That is, make it upper case, replace U by T, replace all undetermined chars by -. See nucleic_acid_codes_undetermined() for a list of the latter.

If accept_degenerated is set to true (default), degenerated chars are just put to upper case, but otherwise left as they are. If set to false, an exception is thrown if a degenerated char is encountered. See nucleic_acid_codes_degenerated() for their list.

Lastly, an exception is also thrown for non nucleic acid codes, that is all chars that are not part of nucleic_acid_codes_all().

Definition at line 368 of file codes.cpp.

◆ normalize_nucleic_acid_codes() [1/2]

void normalize_nucleic_acid_codes ( Sequence sequence,
bool  accept_degenerated = true 
)

Call normalize_nucleic_acid_code() for each site of the Sequence.

See there for details.

Definition at line 372 of file sequence/functions/functions.cpp.

◆ normalize_nucleic_acid_codes() [2/2]

void normalize_nucleic_acid_codes ( SequenceSet sequence_set,
bool  accept_degenerated = true 
)

Call normalize_nucleic_acid_code() for each site of all Sequences in the SequenceSet.

See there for details.

Definition at line 379 of file sequence/functions/functions.cpp.

◆ nucleic_acid_ambiguities()

std::string nucleic_acid_ambiguities ( char  code)

Return the possible ambiguous nucleic acid codes for a given code char.

The codes are resolved as follows:

'A' ==> "A"
'C' ==> "C"
'G' ==> "G"
'T' ==> "T"
'U' ==> "T"

'W' ==> "AT"
'S' ==> "CG"
'M' ==> "AC"
'K' ==> "GT"
'R' ==> "AG"
'Y' ==> "CT"

'B' ==> "CGT"
'D' ==> "AGT"
'H' ==> "ACT"
'V' ==> "ACG"

'N' ==> "ACGT"
'O' ==> "-"
'X' ==> "-"
'.' ==> "-"
'-' ==> "-"
'?' ==> "-"

The code char is treated case-insensitive. If the given code char is not valid, an std::out_of_range exception is thrown.

See nucleic_acid_ambiguity_code() for a reverse version of this function. It is however not exactly the reverse, as some degenerated codes are mapped to the gap char. Thus, this function is not injective.

Definition at line 704 of file codes.cpp.

◆ nucleic_acid_ambiguity_code()

char nucleic_acid_ambiguity_code ( std::string  codes)

Return the nucleic acid code that represents all given codes.

The codes are resolved as follows:

"A"    ==> 'A'
"C"    ==> 'C'
"G"    ==> 'G'
"T"    ==> 'T'

"AT"   ==> 'W'
"CG"   ==> 'S'
"AC"   ==> 'M'
"GT"   ==> 'K'
"AG"   ==> 'R'
"CT"   ==> 'Y'

"CGT"  ==> 'B'
"AGT"  ==> 'D'
"ACT"  ==> 'H'
"ACG"  ==> 'V'

"ACGT" ==> 'N'
"-"    ==> '-'

The given codes are treated case-insensitive and order-independent. For example, given "tCgG", the function still returns ‘'B’. However, if any of the given codes is not valid, an std::out_of_range` exception is thrown.

See nucleic_acid_ambiguities() for the reverse of this function.

Definition at line 713 of file codes.cpp.

◆ nucleic_acid_code_containment()

bool nucleic_acid_code_containment ( char  a,
char  b,
bool  undetermined_matches_all = true 
)

Compare two nucleic acid codes and check if they are equal, taking degenerated/ambiguous characters into account.

That is, 'A' and 'W' yield true, as 'W' contains 'A' and 'T'. The order and casing of the input does not matter. The parameter undetermined_matches_all selects how undetermined characters ("NOX.-?") are treated: if set to true (default), they match ALL other chars, if set to false, they match none.

Definition at line 596 of file codes.cpp.

◆ nucleic_acid_codes_all()

std::string nucleic_acid_codes_all ( )

Return all valid nucleic acid codes. Those are ACGTUWSMKRYBDHVNOX.-?.

Definition at line 315 of file codes.cpp.

◆ nucleic_acid_codes_all_letters()

std::string nucleic_acid_codes_all_letters ( )

Return all valid nucleic acid codes. Those are ACGTUWSMKRYBDHVNOX, that is, excluding .-? as compared to nucleic_acid_codes_all().

Definition at line 322 of file codes.cpp.

◆ nucleic_acid_codes_degenerated()

std::string nucleic_acid_codes_degenerated ( )

Return all degenerated nucleic acid codes. Those are WSMKRYBDHV.

Definition at line 300 of file codes.cpp.

◆ nucleic_acid_codes_plain()

std::string nucleic_acid_codes_plain ( )

Return all plain nucleic acid codes. Those are ACGTU.

Definition at line 295 of file codes.cpp.

◆ nucleic_acid_codes_undetermined()

std::string nucleic_acid_codes_undetermined ( )

Return all undetermined nucleic acid codes. Those are NOX.-?.

Definition at line 305 of file codes.cpp.

◆ nucleic_acid_codes_undetermined_letters()

std::string nucleic_acid_codes_undetermined_letters ( )

Return all undetermined nucleic acid codes that are letters. Those are NOX, that is, excluding .-? as compared to nucleic_acid_codes_undetermined().

Definition at line 310 of file codes.cpp.

◆ nucleic_acid_colors()

std::map< char, utils::Color > nucleic_acid_colors ( )

Return a map of Colors for each nucleic acid code.

This function gives a Color for each nucleic acid code.

Definition at line 672 of file codes.cpp.

◆ nucleic_acid_name()

std::string nucleic_acid_name ( char  code)

Get the name of a nucleic acid given its IUPAC code.

The codes are translated as follows:

A Adenine
C Cytosine
G Guanine
T Thymine
U Uracil
W Weak
S Strong
M aMino
K Keto
R puRine
Y pYrimidine
B not A
D not C
H not G
V not T
N any
O omitted
X masked
. gap
- gap
? gap

The code char is treated case-insensitive. If the given code char is not valid, an std::out_of_range exception is thrown.

Definition at line 686 of file codes.cpp.

◆ nucleic_acid_text_colors()

std::map< char, std::string > nucleic_acid_text_colors ( )

Return a map of text colors for each nucleic acid code.

This function gives a color name usable for utils::Style for each nucleic acid code. The return value of this function can for example be used in sequence::print_color() function.

Definition at line 662 of file codes.cpp.

◆ nucleic_acid_transition()

char nucleic_acid_transition ( char  code)

Return the transition base for the given base.

This yields the transition bases, that is A <-> G and C <-> T, respecitvely. Any other char will throw an exception.

Definition at line 572 of file codes.cpp.

◆ number_of_kmers()

size_t genesis::sequence::number_of_kmers ( size_t  k,
size_t  alphabet_size = 4 
)
inline

Compute the total number of possible k-mers for a given k and alphabet_size.

The default for alphabet_size is 4, for typical nucleotide k-mers over the alphabet ACGT. For instance, with k==6, this yields 4*4*4*4*4*4 == 4096 possible k-mers of that size.

Definition at line 56 of file function.hpp.

◆ operator!=()

bool genesis::sequence::operator!= ( Kmer< Tag > const &  lhs,
Kmer< Tag > const &  rhs 
)

Definition at line 78 of file function.hpp.

◆ operator&()

bool genesis::sequence::operator& ( SiteEntropyOptions  lhs,
SiteEntropyOptions  rhs 
)
inline

And-operator to check whether a SiteEntropyOptions is set.

Typical usage:

SiteEntropyOptions options;
if( options & SiteEntropyOptions::kWeighted ) {
    // Do stuff...
}

Use the or-operator in order to set and combine options.

Definition at line 190 of file sequence/functions/entropy.hpp.

◆ operator<()

bool genesis::sequence::operator< ( Kmer< Tag > const &  lhs,
Kmer< Tag > const &  rhs 
)

Definition at line 96 of file function.hpp.

◆ operator<<() [1/3]

std::ostream& genesis::sequence::operator<< ( std::ostream &  os,
Kmer< Tag > const &  kmer 
)

Definition at line 135 of file function.hpp.

◆ operator<<() [2/3]

std::ostream & operator<< ( std::ostream &  out,
Sequence const &  seq 
)

Print a Sequence to an ostream in the form "label: sites".

As this is meant for quickly having a look at the Sequence, only the first 100 sites are printed. If you need all sites, or more settings like color, use SimplePrinter.

Definition at line 449 of file sequence/functions/functions.cpp.

◆ operator<<() [3/3]

std::ostream & operator<< ( std::ostream &  out,
SequenceSet const &  set 
)

Print a SequenceSet to an ostream in the form "label: sites".

As this is meant for quickly having a look at the SequenceSet, only the first 10 Sequences and the first 100 sites of each are printed. If you need all sequences and sites, or more settings like color, use SimplePrinter.

Definition at line 458 of file sequence/functions/functions.cpp.

◆ operator<=()

bool genesis::sequence::operator<= ( Kmer< Tag > const &  lhs,
Kmer< Tag > const &  rhs 
)

Definition at line 84 of file function.hpp.

◆ operator==()

bool genesis::sequence::operator== ( Kmer< Tag > const &  lhs,
Kmer< Tag > const &  rhs 
)

Definition at line 72 of file function.hpp.

◆ operator>()

bool genesis::sequence::operator> ( Kmer< Tag > const &  lhs,
Kmer< Tag > const &  rhs 
)

Definition at line 102 of file function.hpp.

◆ operator>=()

bool genesis::sequence::operator>= ( Kmer< Tag > const &  lhs,
Kmer< Tag > const &  rhs 
)

Definition at line 90 of file function.hpp.

◆ operator|()

SiteEntropyOptions genesis::sequence::operator| ( SiteEntropyOptions  lhs,
SiteEntropyOptions  rhs 
)
inline

Or-operator to combine two SiteEntropyOptionss.

Typical usage:

auto options = SiteEntropyOptions::kWeighted | SiteEntropyOptions::kNormalized;

Use the and-operator in order to check whether an option is set.

Definition at line 149 of file sequence/functions/entropy.hpp.

◆ operator|=()

SiteEntropyOptions& genesis::sequence::operator|= ( SiteEntropyOptions lhs,
SiteEntropyOptions  rhs 
)
inline

Or-assignment-operator to combine two SiteEntropyOptionss.

Typical usage:

SiteEntropyOptions options;
options |= SiteEntropyOptions::kWeighted;

Use the and-operator in order to check whether an option is set.

Definition at line 168 of file sequence/functions/entropy.hpp.

◆ phred_score_to_error_probability() [1/2]

std::vector< double > phred_score_to_error_probability ( std::vector< unsigned char >  phred_score)

Definition at line 653 of file quality.cpp.

◆ phred_score_to_error_probability() [2/2]

double phred_score_to_error_probability ( unsigned char  phred_score)

Definition at line 581 of file quality.cpp.

◆ phred_score_to_solexa_score() [1/2]

std::vector< signed char > phred_score_to_solexa_score ( std::vector< unsigned char >  phred_score)

Definition at line 680 of file quality.cpp.

◆ phred_score_to_solexa_score() [2/2]

signed char phred_score_to_solexa_score ( unsigned char  phred_score)

Definition at line 624 of file quality.cpp.

◆ quality_decode_to_phred_score() [1/2]

unsigned char quality_decode_to_phred_score ( char  quality_code,
QualityEncoding  encoding = QualityEncoding::kSanger 
)

Decode a single quality score char (for example coming from a fastq file) to a phred score.

The function allows to use different types of quality encoding as used by different sequencing platforms/technologies. This format confusion is messy, see the FastqReader class for details.

Note that Sanger as well as the Illumina encodings are simply encoded as phred plus ASCII offset, while Solexa uses a formula based on odds instead of probability. Hence, when specifying Solexa here, we internally convert to phred before returning the result here.

Definition at line 221 of file quality.cpp.

◆ quality_decode_to_phred_score() [2/2]

std::vector< unsigned char > quality_decode_to_phred_score ( std::string const &  quality_codes,
QualityEncoding  encoding = QualityEncoding::kSanger 
)

Decode a string of quality scores (for example coming from a fastq file) to phred scores.

Decode a single quality score char (for example coming from a fastq file) to a phred score. The function allows to use different types of quality encoding as used by different sequencing platforms/technologies. This format confusion is messy, see the FastqReader class for details.

Note that Sanger as well as the Illumina encodings are simply encoded as phred plus ASCII offset, while Solexa uses a formula based on odds instead of probability. Hence, when specifying Solexa here, we internally convert to phred before returning the result here.

Definition at line 256 of file quality.cpp.

◆ quality_encode_from_phred_score() [1/2]

std::string genesis::sequence::quality_encode_from_phred_score ( std::vector< unsigned char > const &  phred_scores,
bool  clamp = true 
)
inline

Encode phred scores into quality chars, using the Sanger convention.

Encode a phred score into a quality char, using the Sanger convention. This function takes a phred_score in the range 0 to 93, and encodes it, for example for usage in a fastq file, by adding the ASCII offset 33 to it.

While we can decode from numerous formats, see quality_decode_to_phred_score(), we only support encoding back to the Sanger format, because we want to minimize confusion and maximize compatability with other programs. Also, Sanger is used by the NCBI Short Read Archive and Illumina 1.8+, and hence the most common format as of today.

If the flag clamp is set (default), values outside of the valid range 0 to 93 are clamped, that is, set to be inside the valid range. As the phred score is unsigned, this leads to values above 93 simply being encoded as if they were exactly 93. If clamp is set to false, an exception is thrown instead if a value above 93 is encountered.

Definition at line 162 of file quality.hpp.

◆ quality_encode_from_phred_score() [2/2]

char genesis::sequence::quality_encode_from_phred_score ( unsigned char  phred_score,
bool  clamp = true 
)
inline

Encode a phred score into a quality char, using the Sanger convention.

This function takes a phred_score in the range 0 to 93, and encodes it, for example for usage in a fastq file, by adding the ASCII offset 33 to it.

While we can decode from numerous formats, see quality_decode_to_phred_score(), we only support encoding back to the Sanger format, because we want to minimize confusion and maximize compatability with other programs. Also, Sanger is used by the NCBI Short Read Archive and Illumina 1.8+, and hence the most common format as of today.

If the flag clamp is set (default), values outside of the valid range 0 to 93 are clamped, that is, set to be inside the valid range. As the phred score is unsigned, this leads to values above 93 simply being encoded as if they were exactly 93. If clamp is set to false, an exception is thrown instead if a value above 93 is encountered.

Definition at line 141 of file quality.hpp.

◆ quality_encoding_name()

std::string quality_encoding_name ( QualityEncoding  encoding,
bool  with_offset = false 
)

Return a readable name for each of the encoding types.

See QualityEncoding for the names being used here. If with_offset is set, the offset of the encoding is also reported in parenthesis (e.g., "Illumina 1.8+ (ASCII offset 33)").

Definition at line 177 of file quality.cpp.

◆ read_sequence_dict()

SequenceDict read_sequence_dict ( std::shared_ptr< utils::BaseInputSource source)

Read a .dict sequence dictionary file, describing, e.g., reference genome sequence properties.

The file format is for example produced by GATK/Picard CreateSequenceDictionary or by samtools dict. More information on the format can be found at:

See there for details.

Definition at line 51 of file dict.cpp.

◆ read_sequence_fai()

SequenceDict read_sequence_fai ( std::shared_ptr< utils::BaseInputSource source)

Read a .fai sequence index file, describing, e.g., reference genome sequence properties.

The file format is for example produced by samtools faidx. More information on the format can be found at:

See there for details.

Definition at line 131 of file dict.cpp.

◆ reference_genome_to_dict()

SequenceDict reference_genome_to_dict ( ReferenceGenome const &  rg)

Get the sequence dict/index information of a given set of Sequences that are stored in a ReferenceGenome.

This is a helper function that can be used to obtain an overview of the names and lengths of a set of sequences, for some downstream processing, or as information about a reference genome.

See also
sequence_set_to_dict()

Definition at line 198 of file dict.cpp.

◆ relabel_with_hash() [1/2]

void relabel_with_hash ( Sequence seq,
utils::HashingFunctions  hash_function 
)

Relabel the Sequence using the hash digest of its sites.

See utils::HashingFunctions for the available hashing functions.

Definition at line 206 of file labels.cpp.

◆ relabel_with_hash() [2/2]

void relabel_with_hash ( SequenceSet set,
utils::HashingFunctions  hash_function 
)

Relabel all Sequences in the SequenceSet using the hash digest of the sites.

See utils::HashingFunctions for the available hashing functions.

If there are duplicate Sequences, this function will lead to multiple Sequences with the same name, which might be an issue for downstream programs that expect unique labels. See has_unique_labels() to check this.

Definition at line 212 of file labels.cpp.

◆ remove_all_gaps() [1/2]

void remove_all_gaps ( Sequence seq,
std::string const &  gap_chars = nucleic_acid_codes_undetermined() 
)

Remove all gap characters from the sites of the Sequence.

This function is an alias for remove_characters(), which by default uses the gap sites of nucleic_acid_codes_undetermined().

Definition at line 244 of file sequence/functions/functions.cpp.

◆ remove_all_gaps() [2/2]

void remove_all_gaps ( SequenceSet set,
std::string const &  gap_chars = nucleic_acid_codes_undetermined() 
)

Remove all gap characters from the sites of the Sequences in the SequenceSet.

This function is an alias for remove_characters(), which by default uses the gap sites of nucleic_acid_codes_undetermined().

Definition at line 249 of file sequence/functions/functions.cpp.

◆ remove_characters() [1/2]

void remove_characters ( Sequence seq,
std::string const &  search 
)

Remove all of the characters in search from the sites of the Sequence.

Definition at line 227 of file sequence/functions/functions.cpp.

◆ remove_characters() [2/2]

void remove_characters ( SequenceSet set,
std::string const &  search 
)

Remove all of the characters in search from the sites of the Sequences in the SequenceSet.

Definition at line 237 of file sequence/functions/functions.cpp.

◆ remove_gap_sites()

void remove_gap_sites ( SequenceSet set,
std::string const &  gap_chars 
)

Remove all sites that only contain gap characters from the SequenceSet.

Definition at line 221 of file sequence/functions/functions.cpp.

◆ remove_sites() [1/2]

void remove_sites ( Sequence seq,
utils::Bitvector  sites 
)

Remove all sites from a Sequence where the given Bitvector is true, and keep all others.

The Bitvector needs to have the same size as the Sequence, otherwise an expection is thrown.

This function is for example useful in combination with gap_sites().

Definition at line 183 of file sequence/functions/functions.cpp.

◆ remove_sites() [2/2]

void remove_sites ( SequenceSet set,
utils::Bitvector  sites 
)

Remove all sites from all Sequences in a SequenceSet where the given Bitvector is true, and keep all others.

The Bitvector and all Sequences need to have the same size, otherwise an expection is thrown. This check is done before any Sequence is changed. Thus, if the function throws for this reason, the Sequences are left unchanged.

This function is for example useful in combination with gap_sites().

Definition at line 205 of file sequence/functions/functions.cpp.

◆ replace_characters() [1/2]

void replace_characters ( Sequence seq,
std::string const &  search,
char  replacement 
)

Replace all occurences of the chars in search by the replace char, for all sites in the given Sequence.

The function is case sensitive. Thus, you need to use both cases for the search string if you are unsure. The replace char is always used as is, independent of the case of the matching search char.

Definition at line 254 of file sequence/functions/functions.cpp.

◆ replace_characters() [2/2]

void replace_characters ( SequenceSet set,
std::string const &  search,
char  replacement 
)

Replace all occurences of the chars in search by the replace char, for all sites in the Sequences in the given SequenceSet.

The function is case sensitive. Thus, you need to use both cases for the search string if you are unsure. The replace char is always used as is, independent of the case of the matching search char.

Definition at line 259 of file sequence/functions/functions.cpp.

◆ replace_t_with_u() [1/2]

void replace_t_with_u ( Sequence seq)

Replace all occurrences of T by U in the sites of the Sequence.

This is a small helper function for sequences with nucleic acid codes. It is case sensitive, that is, lower case t is replaced by lower case u, and upper case T by upper case U.

Definition at line 285 of file sequence/functions/functions.cpp.

◆ replace_t_with_u() [2/2]

void replace_t_with_u ( SequenceSet set)

Replace all occurrences of T by U in the sites of all Sequences in the SequenceSet.

This is a small helper function for sequences with nucleic acid codes. It is case sensitive, that is, lower case t is replaced by lower case u, and upper case T by upper case U.

Definition at line 297 of file sequence/functions/functions.cpp.

◆ replace_u_with_t() [1/2]

void replace_u_with_t ( Sequence seq)

Replace all occurrences of U by T in the sites of the Sequence.

This is a small helper function for sequences with nucleic acid codes. It is case sensitive, that is, lower case u is replaced by lower case t, and upper case U by upper case T.

Definition at line 266 of file sequence/functions/functions.cpp.

◆ replace_u_with_t() [2/2]

void replace_u_with_t ( SequenceSet set)

Replace all occurrences of U by T in the sites of all Sequences in the SequenceSet.

This is a small helper function for sequences with nucleic acid codes. It is case sensitive, that is, lower case u is replaced by lower case t, and upper case U by upper case T.

Definition at line 278 of file sequence/functions/functions.cpp.

◆ reverse_complement() [1/2]

Kmer<Tag> genesis::sequence::reverse_complement ( Kmer< Tag > const &  kmer)

Definition at line 146 of file function.hpp.

◆ reverse_complement() [2/2]

std::string reverse_complement ( std::string const &  sequence,
bool  accept_degenerated = true 
)

Get the reverse complement of a nucleic acid sequence.

That is, reverse the string and flip A with T and C with G. Gap characters are normalized to -, and an exception is thrown for invalid characters.

If furthermore accept_degenerated is true (default), degenerated codes are also flipped. For example M == AC becomes K == TG, W == AT stays the same, and B == CGT becomes V = GCA. If set to false, an exception is thrown when degenerated chars are found.

Definition at line 501 of file codes.cpp.

◆ sanitize_label() [1/2]

void sanitize_label ( Sequence seq)

Sanitize a label by replacing all invalid characters with underscores.

This might be important for printing the Sequences to a file that needs to be read by other applications. See is_valid_label() for details on what is considered a valid label.

Definition at line 265 of file labels.cpp.

◆ sanitize_label() [2/2]

std::string sanitize_label ( std::string const &  label)

Sanitize a label by replacing all invalid characters with underscores.

This might be important for printing the Sequences to a file that needs to be read by other applications. See is_valid_label() for details on what is considered a valid label.

Definition at line 249 of file labels.cpp.

◆ sanitize_labels()

void sanitize_labels ( SequenceSet set)

Sanitize the labels of all Sequences in the SequenceSet by replacing all invalid characters with underscores.

This might be important for printing the Sequences to a file that needs to be read by other applications. See is_valid_label() for details on what is considered a valid label.

Definition at line 270 of file labels.cpp.

◆ sequence_iterable_to_dict_()

SequenceDict genesis::sequence::sequence_iterable_to_dict_ ( T const &  input)

Definition at line 184 of file dict.cpp.

◆ sequence_set_to_dict()

SequenceDict sequence_set_to_dict ( SequenceSet const &  set)

Get the sequence dict/index information of a given set of Sequences.

This is a helper function that can be used to obtain an overview of the names and lengths of a set of sequences, for some downstream processing, or as information about a reference genome.

See also
reference_genome_to_dict()

Definition at line 193 of file dict.cpp.

◆ signature_complementarity_frequencies_helper()

std::vector<double> genesis::sequence::signature_complementarity_frequencies_helper ( Sequence const &  sequence,
SignatureSpecifications const &  settings,
Combinator  combinator 
)

Local helper function that returns a vector where the frequencies of the non-palindromic kmers are combined using a functor.

Definition at line 210 of file signatures.cpp.

◆ signature_counts()

std::vector< size_t > signature_counts ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Count the occurences of k-mers in the sequence according to the settings.

The function returns a vector that contains the count for each k-mer that can be build from the characters in the alphabet, in the order given by SignatureSpecifications::kmer_list().

Definition at line 57 of file signatures.cpp.

◆ signature_frequencies()

std::vector< double > signature_frequencies ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the frequencies of occurences of k-mers in the sequence according to the settings.

The function simply calculates the frequencies as the normalized values of signature_counts().

Definition at line 117 of file signatures.cpp.

◆ signature_frequency_ratios_1()

std::vector< double > signature_frequency_ratios_1 ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the ratio 1 signature of a sequence.

The function is calculated according to

F. Gori, D. Mavroedis, M. S. M. Jetten, and E. Marchiori, “Genomic signatures for metagenomic data analysis: Exploiting the reverse complementarity of tetranucleotides,” 2011 IEEE Int. Conf. Syst. Biol. ISB 2011, pp. 149–154, 2011.

It excludes palindromic k-mers where the reverse complement is the k-mer itself.

Definition at line 296 of file signatures.cpp.

◆ signature_frequency_ratios_2()

std::vector< double > signature_frequency_ratios_2 ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the ratio 2 signature of a sequence.

The function is calculated according to

F. Gori, D. Mavroedis, M. S. M. Jetten, and E. Marchiori, “Genomic signatures for metagenomic data analysis: Exploiting the reverse complementarity of tetranucleotides,” 2011 IEEE Int. Conf. Syst. Biol. ISB 2011, pp. 149–154, 2011.

It excludes palindromic k-mers where the reverse complement is the k-mer itself.

Definition at line 316 of file signatures.cpp.

◆ signature_jensen_shannon()

std::vector< double > signature_jensen_shannon ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the Jensen-Shannon (JS) signature of a sequence.

The function is calculated according to

F. Gori, D. Mavroedis, M. S. M. Jetten, and E. Marchiori, “Genomic signatures for metagenomic data analysis: Exploiting the reverse complementarity of tetranucleotides,” 2011 IEEE Int. Conf. Syst. Biol. ISB 2011, pp. 149–154, 2011.

using details of

J. Lin, “Divergence Measures Based on the Shannon Entropy,” IEEE Trans. Inf. Theory, vol. 37, no. 1, pp. 145–151, 1991.

It excludes palindromic k-mers where the reverse complement is the k-mer itself.

Definition at line 334 of file signatures.cpp.

◆ signature_maximal_complementarity_frequencies()

std::vector< double > signature_maximal_complementarity_frequencies ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the signature of a sequence that uses the maximum frequency of reverse complement k-mers.

The function is calculated according to

F. Gori, D. Mavroedis, M. S. M. Jetten, and E. Marchiori, “Genomic signatures for metagenomic data analysis: Exploiting the reverse complementarity of tetranucleotides,” 2011 IEEE Int. Conf. Syst. Biol. ISB 2011, pp. 149–154, 2011.

It excludes palindromic k-mers where the reverse complement is the k-mer itself.

Definition at line 257 of file signatures.cpp.

◆ signature_minimal_complementarity_frequencies()

std::vector< double > signature_minimal_complementarity_frequencies ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the signature of a sequence that uses the minimum frequency of reverse complement k-mers.

The function is calculated according to

F. Gori, D. Mavroedis, M. S. M. Jetten, and E. Marchiori, “Genomic signatures for metagenomic data analysis: Exploiting the reverse complementarity of tetranucleotides,” 2011 IEEE Int. Conf. Syst. Biol. ISB 2011, pp. 149–154, 2011.

It excludes palindromic k-mers where the reverse complement is the k-mer itself.

Definition at line 244 of file signatures.cpp.

◆ signature_ranks()

std::vector< size_t > signature_ranks ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calcuate the rank signature of a sequence according to the settings.

That is, standard ranking is applied to the k-mer counts of the Sequence.

Definition at line 183 of file signatures.cpp.

◆ signature_reverse_identity_frequencies()

std::vector< double > signature_reverse_identity_frequencies ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calculate the signature of a sequence that uses only the frequencies of k-mers whose reverse complement is the k-mer itself.

The function is calculated according to

F. Gori, D. Mavroedis, M. S. M. Jetten, and E. Marchiori, “Genomic signatures for metagenomic data analysis: Exploiting the reverse complementarity of tetranucleotides,” 2011 IEEE Int. Conf. Syst. Biol. ISB 2011, pp. 149–154, 2011.

It excludes k-mers where the reverse complement is a different k-mer.

Definition at line 270 of file signatures.cpp.

◆ signature_symmetrized_counts()

std::vector< size_t > signature_symmetrized_counts ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calcuate the symmetrized counts of the sequence according to the settings.

The function uses signature_counts(), and sums up the counts of k-mers that are reverse complements of each other.

Definition at line 159 of file signatures.cpp.

◆ signature_symmetrized_frequencies()

std::vector< double > signature_symmetrized_frequencies ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calcuate the symmetrized counts of the sequence according to the settings.

The function uses signature_frequencies(), and sums up the counts of k-mers that are reverse complements of each other.

Definition at line 169 of file signatures.cpp.

◆ signature_symmetrized_helper()

std::vector<T> genesis::sequence::signature_symmetrized_helper ( std::vector< T > const &  kmers,
SignatureSpecifications const &  settings 
)

Local helper function that adds up the values for reverse complement k-mers.

Definition at line 142 of file signatures.cpp.

◆ signature_symmetrized_ranks()

std::vector< size_t > signature_symmetrized_ranks ( Sequence const &  sequence,
SignatureSpecifications const &  settings 
)

Calcuate the symmetrized rank signature of a sequence according to the settings.

That is, standard ranking is applied to the symmetrized (combined reverse complement) k-mer counts of the Sequence.

Definition at line 192 of file signatures.cpp.

◆ site_entropy()

double site_entropy ( SiteCounts const &  counts,
size_t  site_index,
SiteEntropyOptions  options = SiteEntropyOptions::kDefault 
)

Calculate the entropy at one site of a SiteCounts object.

The entropy \( H \) (uncertainty) at site \( i \) (= site_idx) is calculated as \( H_{i}=-\sum f_{{c,i}}\times \log _{2}f_{{c,i}} \), where \( f_{c,i} \) is the relative frequency of character \( c \) at site \( i \), summed over all characters in the SiteCounts object.

The function additionally takes optional flags to refine the calculation, see SiteEntropyOptions for their explanation.

Definition at line 49 of file sequence/functions/entropy.cpp.

◆ site_histogram() [1/2]

std::map< char, size_t > site_histogram ( Sequence const &  seq)

Get a histogram of the occurrences of particular sites, given a Sequence.

This gives the raw counts of how often each site (character) appears in the Sequence. See base_frequencies() for the relative version of this function.

Definition at line 53 of file sequence/functions/stats.cpp.

◆ site_histogram() [2/2]

std::map< char, size_t > site_histogram ( SequenceSet const &  set)

Get a histogram of the occurrences of particular sites, given a SequenceSet.

This gives the raw counts of how often each site (character) appears in the whole set. See base_frequencies() for the relative version of this function.

Definition at line 77 of file sequence/functions/stats.cpp.

◆ site_information()

double site_information ( SiteCounts const &  counts,
size_t  site_index,
bool  use_small_sample_correction = false,
SiteEntropyOptions  options = SiteEntropyOptions::kDefault 
)

Calculate the information content at one site of a SiteCounts object.

The information content \( R \) at site \( i \) (= site_index) is calculated as \( R_{i} = \log_{2}( s ) - (H_{i}+e_{n}) \).

Here, \( s \) is the number of possible characters in the sequences (usually, 4 for nucleic acids and 20 for amino acids), which is taken from the characters() used in the SiteCounts object. Furthermore, \( H_{i} \) is the site_entropy() at the given site.

The optional term \( e_{n} \) is the small-sample correction, calculated as \( e_{n}={\frac{1}{\ln {2}}}\times {\frac{s-1}{2n}} \), with \( n \) being the number of sequences. It is only used if use_small_sample_correction is set to true (default is false).

The function additionally takes optional flags to refine the site entropy calculation, see SiteEntropyOptions for their explanation.

Definition at line 110 of file sequence/functions/entropy.cpp.

◆ solexa_score_to_error_probability() [1/2]

double solexa_score_to_error_probability ( signed char  solexa_score)

Definition at line 615 of file quality.cpp.

◆ solexa_score_to_error_probability() [2/2]

std::vector< double > solexa_score_to_error_probability ( std::vector< signed char >  solexa_score)

Definition at line 671 of file quality.cpp.

◆ solexa_score_to_phred_score() [1/2]

unsigned char solexa_score_to_phred_score ( signed char  solexa_score)

Definition at line 636 of file quality.cpp.

◆ solexa_score_to_phred_score() [2/2]

std::vector< unsigned char > solexa_score_to_phred_score ( std::vector< signed char >  solexa_score)

Definition at line 689 of file quality.cpp.

◆ throw_invalid_quality_code_()

void genesis::sequence::throw_invalid_quality_code_ ( char  quality_code,
QualityEncoding  encoding 
)
inline

Local helper function to throw if any invalid fastq quality chars are being used.

Definition at line 169 of file quality.cpp.

◆ to_string()

std::string genesis::sequence::to_string ( Kmer< Tag > const &  kmer)

Definition at line 108 of file function.hpp.

◆ total_length()

size_t total_length ( SequenceSet const &  set)

Return the total length (sum) of all Sequences in the SequenceSet.

Definition at line 155 of file sequence/functions/functions.cpp.

◆ validate_chars()

bool validate_chars ( SequenceSet const &  set,
std::string const &  chars 
)

Returns true iff all Sequences only consist of the given chars.

For presettings of usable chars, see the functions nucleic_acid_codes_... and amino_acid_codes_.... For example, to check whether the sequences are nucleic acids, use nucleic_acid_codes_all(). The chars are treated case-insensitive.

If chars contains invalid (non-standard ASCII) characters, an std::invalid_argument exception is thrown.

Definition at line 121 of file sequence/functions/functions.cpp.

◆ verify()

bool verify ( SequenceDict const &  dict,
SequenceSet const &  set,
bool  match_first_word = false 
)

Verify that a SequenceDict fits a SequenceSet.

The function return true iff the dict has sequences of the same name and length, in the same order, as the set, and false otherwise.

If match_first_word is set, we cut off the sequence names after the first tab or space character, in case there are any, as this is what typical fasta indexing tools also seem to do. In that case, anything after the first tab or space is not used for the comparison.

Definition at line 280 of file dict.cpp.

Typedef Documentation

◆ CountPair

using CountPair = std::pair< size_t, char >

Helper struct to store characters sorted by their count.

Definition at line 55 of file consensus.cpp.

◆ FastaInputStream

◆ FastaOutputStream

◆ FastqInputStream

◆ FastqOutputStream

Enumeration Type Documentation

◆ MergeDuplicateSequencesCountPolicy

Provide options for changing how merge_duplicate_sequences() handles the counts of merged Sequences.

This allows to remove duplicate sequences while still keeping track of how many there were before merging them.

Enumerator
kDiscard 

The counts are discarded.

kAppendToLabel 

The counts are appended to the sequence label, separated by the counter_prefix.

Definition at line 251 of file sequence/functions/functions.hpp.

◆ QualityEncoding

enum QualityEncoding
strong

List of quality encodings for which we support decoding.

We offer the following quality score encodings:

  • Sanger, with offset 33.
  • Illumina 1.3+, with offset 64.
  • Illumina 1.5+, with offset 64.
  • Illumina 1.8+, with offset 33.
  • Solexa, with offset 64, and a special encoding equation.

These are the types of encodings used in fastq files over the years. It seems that Sanger / Illumina 1.8+ is the most commonly used one today, so this is also what we use as a default.

Enumerator
kSanger 
kSolexa 
kIllumina13 
kIllumina15 
kIllumina18 

Definition at line 72 of file quality.hpp.

◆ ReferenceComparisonMode

Chose how to deal with sub-/super-sets when comparing references.

When comparing different reference genomes (e.g, fasta files), dictionaries (e.g., dict or fai files), mask files, etc, we have to make a decision on whether to be strict about the sequences contained in each of them. For instance, a reference genome might contain chromosomes "1" to "5", while a mask file might only be containing a mask for chromosome "1". This mode enum selects what to do in these cases: Do we want to allow one of the two comparands to be a sub- or super-set of the other?

Typically, we then want to compare if the sequence lengths of all sequences in the two sets match. If that is not the case for the sequences that are in both sets, they are definitely not compatible. See compatible_references() for a function where this is used.

Enumerator
kStrict 

Both compared reference sets have to contain the exact same sequence names.

kLeftSuperset 

The left hand reference set is allowed to contain sequences that are not in the right hand side. These sequences are ignored in the comparison.

kRightSuperset 

The right hand reference set is allowed to contain sequences that are not in the lefthand side. These sequences are ignored in the comparison.

kSharedOnly 

Either reference set can contain sequences that are not in the other. Only the shared ones are used for the comparison.

Definition at line 114 of file dict.hpp.

◆ SiteEntropyOptions

enum SiteEntropyOptions : unsigned char
strong

Option flags to refine the calculation of site_entropy().

The flags can be combined via the binary or operator |:

auto flags = SiteEntropyOptions::kIncludeGaps | SiteEntropyOptions::kNormalized;

For checking whether a partcular option is set, use the binary and operator &:

if( flags & SiteEntropyOptions::kIncludeGaps ) {
    // ...
}

The option flags can be used with all functions that calculate the entropy. They are applied on a per-site basis; i.e., they are used for calculating the site_entropy(), which is the basis for higher level entropy functions like absolute_entropy() and averaged_entropy().

Enumerator
kDefault 

Default option, simply calculate the site entropy using the characters used in the SiteCounts object.

kIncludeGaps 

In addition to the characters of the SiteCounts object, use the undetermined and gap characters.

With this option, an additional term is added to the entropy, using the "rest" probability of the site. The counts of all characters at a site in the SiteCounts do not always add up to the number of sequences that have been added. In cases where a Sequence contains gaps or characters that are not set in the SiteCounts object, those counts are simply ignored when processing the Sequence and counting its sites.

Using this ignored rest, an additional entropy term is calculated and added to the total entropy, if this option is used.

For example, if the SiteCounts object counts all occurences of A, C, G and T, and a Sequence added to it contains a gap - or a degenerated character like W, those character are ignored in the counts. The total number of those characters is then:

auto gap_count = counts.added_sequences_count()
               - counts.count_of( 'A', site_idx )
               - counts.count_of( 'C', site_idx )
               - counts.count_of( 'G', site_idx )
               - counts.count_of( 'T', site_idx );

This number is then used as an additional summand for the entropy: \( H_{i} \) += \( -f_{{-,i}}\times \log _{2}f_{{-,i}} \), with \( f_{-,i} = \) gap_count/counts.added_sequences_count() being the relative frequency of those gaps and undetermined characters.

kWeighted 

Weight the entropy using the summed relative frequencies of the characters.

The entropy per site depends on the frequencies of the chracters. However, per default, gaps and other undetermined characters (those which are not used in the SiteCounts object) are igonred. Thus, the entropy for sites that contain mostly gaps might still have quite a high value. In cases where mostly-gap sites shall add little to the total entropy, this option can be used to reduce their influence. The site_entropy() is then weighted using the sum of the frequencies of the determined sites.

For example, if a site has fractions of 0.1 for all four characters A, C, G and T, the resulting entropy is weighted with a factor of 0.4.

See also SiteEntropyOptions::kIncludeGaps for a related option to take those into accoount.

kNormalized 

Normalize the resulting entropy using the maximum entropy possible.

This option results in entropy values in the range [ 0.0, 1.0 ]. This is achieved by dividing the entropy by the maxmimal value that is possible given the used characters of the SiteCounts object. For example, with the four characters A, C, G and T, the entropy is divided by \( log_2{(4)} \).

If additionally the SiteEntropyOptions::kIncludeGaps flag is set, the divisor is calculated using one additional value, e.g., \( log_2{(4+1)} \).

Definition at line 70 of file sequence/functions/entropy.hpp.

Variable Documentation

◆ amino_acid_code_to_name

const std::unordered_map<char, std::string> amino_acid_code_to_name
static

Definition at line 77 of file codes.cpp.

◆ amino_acid_colors_map

const std::map<char, utils::Color> amino_acid_colors_map
static

Definition at line 201 of file codes.cpp.

◆ amino_acid_text_colors_map

const std::map<char, std::string> amino_acid_text_colors_map
static

Definition at line 141 of file codes.cpp.

◆ nucleic_acid_ambiguity_char_map

const std::unordered_map<char, std::string> nucleic_acid_ambiguity_char_map
static
Initial value:
= {
{ 'A', "A" },
{ 'C', "C" },
{ 'G', "G" },
{ 'T', "T" },
{ 'U', "T" },
{ 'W', "AT" },
{ 'S', "CG" },
{ 'M', "AC" },
{ 'K', "GT" },
{ 'R', "AG" },
{ 'Y', "CT" },
{ 'B', "CGT" },
{ 'D', "AGT" },
{ 'H', "ACT" },
{ 'V', "ACG" },
{ 'N', "ACGT" },
{ 'O', "-" },
{ 'X', "-" },
{ '.', "-" },
{ '-', "-" },
{ '?', "-" }
}

Definition at line 238 of file codes.cpp.

◆ nucleic_acid_ambiguity_string_map

const std::unordered_map< std::string, char > nucleic_acid_ambiguity_string_map
static
Initial value:
= {
{ "A", 'A' },
{ "C", 'C' },
{ "G", 'G' },
{ "T", 'T' },
{ "AT", 'W' },
{ "CG", 'S' },
{ "AC", 'M' },
{ "GT", 'K' },
{ "AG", 'R' },
{ "CT", 'Y' },
{ "CGT", 'B' },
{ "AGT", 'D' },
{ "ACT", 'H' },
{ "ACG", 'V' },
{ "ACGT", 'N' },
{ "-", '-' }
}

Definition at line 265 of file codes.cpp.

◆ nucleic_acid_code_to_name

const std::unordered_map<char, std::string> nucleic_acid_code_to_name
static
Initial value:
= {
{ 'A', "Adenine" },
{ 'C', "Cytosine" },
{ 'G', "Guanine" },
{ 'T', "Thymine" },
{ 'U', "Uracil" },
{ 'W', "Weak" },
{ 'S', "Strong" },
{ 'M', "aMino" },
{ 'K', "Keto" },
{ 'R', "puRine" },
{ 'Y', "pYrimidine" },
{ 'B', "not A" },
{ 'D', "not C" },
{ 'H', "not G" },
{ 'V', "not T" },
{ 'N', "any" },
{ 'O', "omitted" },
{ 'X', "masked" },
{ '.', "gap" },
{ '-', "gap" },
{ '?', "gap" }
}

Definition at line 50 of file codes.cpp.

◆ nucleic_acid_colors_map

const std::map<char, utils::Color> nucleic_acid_colors_map
static
Initial value:
= {
{ 'A', { 1.0, 0.0, 0.0 } },
{ 'C', { 0.0, 1.0, 0.0 } },
{ 'G', { 1.0, 1.0, 0.0 } },
{ 'T', { 0.0, 0.0, 1.0 } },
{ 'U', { 0.0, 0.0, 1.0 } },
{ 'W', { 0.376, 0.376, 0.376 } },
{ 'S', { 0.376, 0.376, 0.376 } },
{ 'M', { 0.376, 0.376, 0.376 } },
{ 'K', { 0.376, 0.376, 0.376 } },
{ 'R', { 0.376, 0.376, 0.376 } },
{ 'Y', { 0.376, 0.376, 0.376 } },
{ 'B', { 0.5, 0.5, 0.5 } },
{ 'D', { 0.5, 0.5, 0.5 } },
{ 'H', { 0.5, 0.5, 0.5 } },
{ 'V', { 0.5, 0.5, 0.5 } },
{ 'N', { 0.5, 0.5, 0.5 } },
{ 'O', { 0.5, 0.5, 0.5 } },
{ 'X', { 0.5, 0.5, 0.5 } },
{ '.', { 0.5, 0.5, 0.5 } },
{ '-', { 0.5, 0.5, 0.5 } },
{ '?', { 0.5, 0.5, 0.5 } }
}

Definition at line 174 of file codes.cpp.

◆ nucleic_acid_text_colors_map

const std::map<char, std::string> nucleic_acid_text_colors_map
static
Initial value:
= {
{ 'A', "Red" },
{ 'C', "Green" },
{ 'G', "Yellow" },
{ 'T', "Blue" },
{ 'U', "Blue" },
{ 'W', "DarkGray" },
{ 'S', "DarkGray" },
{ 'M', "DarkGray" },
{ 'K', "DarkGray" },
{ 'R', "DarkGray" },
{ 'Y', "DarkGray" },
{ 'B', "DarkGray" },
{ 'D', "DarkGray" },
{ 'H', "DarkGray" },
{ 'V', "DarkGray" },
{ 'N', "DarkGray" },
{ 'O', "DarkGray" },
{ 'X', "DarkGray" },
{ '.', "DarkGray" },
{ '-', "DarkGray" },
{ '?', "DarkGray" }
}

Definition at line 114 of file codes.cpp.

◆ phred_score_to_error_probability_lookup_

const std::array<double, 256> phred_score_to_error_probability_lookup_
static

Definition at line 82 of file quality.cpp.