41 #include <unordered_map> 62 {
'Y',
"pYrimidine" },
79 {
'B',
"Aspartic acid or Asparagine" },
81 {
'D',
"Aspartic acid" },
82 {
'E',
"Glutamic acid" },
83 {
'F',
"Phenylalanine" },
86 {
'I',
"Isoleucine" },
87 {
'J',
"Leucine or Isoleucine" },
90 {
'M',
"Methionine" },
91 {
'N',
"Asparagine" },
92 {
'O',
"Pyrrolysine" },
98 {
'U',
"Selenocysteine" },
100 {
'W',
"Tryptophan" },
102 {
'Z',
"Glutamic acid or Glutamine" },
105 {
'*',
"translation stop" },
144 {
'C',
"LightMagenta" },
175 {
'A', { 1.0, 0.0, 0.0 } },
176 {
'C', { 0.0, 1.0, 0.0 } },
177 {
'G', { 1.0, 1.0, 0.0 } },
178 {
'T', { 0.0, 0.0, 1.0 } },
179 {
'U', { 0.0, 0.0, 1.0 } },
181 {
'W', { 0.376, 0.376, 0.376 } },
182 {
'S', { 0.376, 0.376, 0.376 } },
183 {
'M', { 0.376, 0.376, 0.376 } },
184 {
'K', { 0.376, 0.376, 0.376 } },
185 {
'R', { 0.376, 0.376, 0.376 } },
186 {
'Y', { 0.376, 0.376, 0.376 } },
188 {
'B', { 0.5, 0.5, 0.5 } },
189 {
'D', { 0.5, 0.5, 0.5 } },
190 {
'H', { 0.5, 0.5, 0.5 } },
191 {
'V', { 0.5, 0.5, 0.5 } },
193 {
'N', { 0.5, 0.5, 0.5 } },
194 {
'O', { 0.5, 0.5, 0.5 } },
195 {
'X', { 0.5, 0.5, 0.5 } },
196 {
'.', { 0.5, 0.5, 0.5 } },
197 {
'-', { 0.5, 0.5, 0.5 } },
198 {
'?', { 0.5, 0.5, 0.5 } }
202 {
'A', { 0.098, 0.500, 1.000 } },
203 {
'B', { 0.376, 0.376, 0.376 } },
204 {
'C', { 0.902, 0.500, 0.500 } },
205 {
'D', { 0.800, 0.302, 0.800 } },
206 {
'E', { 0.800, 0.302, 0.800 } },
207 {
'F', { 0.098, 0.500, 1.000 } },
208 {
'G', { 0.902, 0.600, 0.302 } },
209 {
'H', { 0.098, 0.702, 0.702 } },
210 {
'I', { 0.098, 0.500, 1.000 } },
211 {
'J', { 0.376, 0.376, 0.376 } },
212 {
'K', { 0.902, 0.200, 0.098 } },
213 {
'L', { 0.098, 0.500, 1.000 } },
214 {
'M', { 0.098, 0.500, 1.000 } },
215 {
'N', { 0.098, 0.800, 0.098 } },
216 {
'O', { 0.376, 0.376, 0.376 } },
217 {
'P', { 0.800, 0.800, 0.000 } },
218 {
'Q', { 0.098, 0.800, 0.098 } },
219 {
'R', { 0.902, 0.200, 0.098 } },
220 {
'S', { 0.098, 0.800, 0.098 } },
221 {
'T', { 0.098, 0.800, 0.098 } },
222 {
'U', { 0.376, 0.376, 0.376 } },
223 {
'V', { 0.098, 0.500, 1.000 } },
224 {
'W', { 0.098, 0.500, 1.000 } },
225 {
'Y', { 0.098, 0.702, 0.702 } },
226 {
'Z', { 0.376, 0.376, 0.376 } },
228 {
'X', { 0.5, 0.5, 0.5 } },
229 {
'*', { 0.5, 0.5, 0.5 } },
230 {
'-', { 0.5, 0.5, 0.5 } },
231 {
'?', { 0.5, 0.5, 0.5 } }
323 return "ACDEFGHIKLMNOPQRSTUVWY";
351 std::sort( normalized.begin(), normalized.end() );
352 normalized.erase( std::unique( normalized.begin(), normalized.end() ), normalized.end() );
393 if( accept_degenerated ) {
396 throw std::invalid_argument(
397 "Degenerated nucleic acid code not accepted: " + std::string( 1, code )
411 throw std::invalid_argument(
"Not a nucleic acid code: " + std::string( 1, code ) );
470 if( accept_degenerated ) {
473 throw std::invalid_argument(
474 "Degenerated amino acid code not accepted: " + std::string( 1, code )
484 throw std::invalid_argument(
"Not an amino acid code: " + std::string( 1, code ) );
491 auto result = std::string( sequence.size(),
'-' );
494 auto rev_comp = [](
char c ){
531 throw std::invalid_argument(
"Not a nucleic acid code: " + std::string( 1, c ) );
536 for(
size_t i = 0; i < sequence.size(); ++i ) {
537 char c = sequence[i];
541 if( c ==
'n' || c ==
'N' ) {
542 if( accept_degenerated ) {
543 result[ sequence.size() - i - 1 ] =
'N';
546 throw std::invalid_argument(
547 "Degenerated nucleic acid code not accepted: " + std::string( 1, c )
554 result[ sequence.size() - i - 1 ] = rev_comp( c );
563 auto binary_code_ = [ undetermined_matches_all ](
char c ){
597 if( undetermined_matches_all ) {
608 throw std::invalid_argument(
"Not a nucleic acid code: " + std::string( 1, c ) );
615 auto const ab = binary_code_( an );
616 auto const bb = binary_code_( bn );
618 return ( ab & bb ) > 0;
651 auto ucode = toupper(code);
652 if( nucleic_acid_code_to_name.count( ucode ) == 0 ) {
653 throw std::out_of_range(
"Invalid nucleic acid code '" + std::string( 1, code ) +
"'." );
655 return nucleic_acid_code_to_name.at( ucode );
660 auto ucode = toupper(code);
661 if( amino_acid_code_to_name.count( ucode ) == 0 ) {
662 throw std::out_of_range(
"Invalid amino acid code '" + std::string( 1, code ) +
"'." );
664 return amino_acid_code_to_name.at( ucode );
669 auto ucode = toupper(code);
670 if( nucleic_acid_code_to_name.count( ucode ) == 0 ) {
671 throw std::out_of_range(
"Invalid nucleic acid code '" + std::string( 1, code ) +
"'." );
673 return nucleic_acid_ambiguity_char_map.at( ucode );
680 std::sort( tmp.begin(), tmp.end() );
681 tmp.erase( std::unique( tmp.begin(), tmp.end() ), tmp.end() );
683 if( nucleic_acid_ambiguity_string_map.count( tmp ) == 0 ) {
684 throw std::out_of_range(
"Invalid nucleic acid codes '" + codes +
"'." );
686 return nucleic_acid_ambiguity_string_map.at( tmp );
std::string amino_acid_codes_all()
Return all valid amino acid codes. Those are "ACDEFGHIKLMNOPQRSTUVWYBJZX*-?".
static const std::map< char, std::string > nucleic_acid_text_colors_map
std::string amino_acid_codes_degenerated()
Return all degenerated amino acid codes. Those are "BJZ".
std::string amino_acid_codes_plain()
Return all plain amino acid codes. Those are "ACDEFGHIKLMNOPQRSTUVWY".
std::string nucleic_acid_codes_all()
Return all valid nucleic acid codes. Those are "ACGTUWSMKRYBDHVNOX.-?".
std::string nucleic_acid_name(char code)
Get the name of a nucleic acid given its IUPAC code.
static const std::unordered_map< char, std::string > nucleic_acid_code_to_name
std::string normalize_code_alphabet(std::string const &alphabet)
Normalize an alphabet set of Sequence codes, i.e., make them upper case, sort them, and remove duplicates.
std::string nucleic_acid_codes_plain()
Return all plain nucleic acid codes. Those are "ACGTU".
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::string nucleic_acid_codes_degenerated()
Return all degenerated nucleic acid codes. Those are "WSMKRYBDHV".
std::string nucleic_acid_codes_undetermined()
Return all undetermined nucleic acid codes. Those are "NOX.-?".
std::map< char, std::string > nucleic_acid_text_colors()
Return a map of text colors for each nucleic acid code.
std::string reverse_complement(std::string const &sequence, bool accept_degenerated)
Get the reverse complement of a nucleic acid sequence.
std::string amino_acid_name(char code)
Get the name of an amino acid given its IUPAC code.
static const std::unordered_map< char, std::string > amino_acid_code_to_name
char normalize_amino_acid_code(char code, bool accept_degenerated)
Normalize an amino acid code.
static const std::map< char, utils::Color > nucleic_acid_colors_map
Provides some commonly used string utility functions.
static const std::unordered_map< char, std::string > nucleic_acid_ambiguity_char_map
std::map< char, utils::Color > amino_acid_colors()
Return a map of Colors for each amino acid code.
static const std::map< char, std::string > amino_acid_text_colors_map
std::map< char, std::string > amino_acid_text_colors()
Return a map of text colors for each amino acid code.
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
char normalize_nucleic_acid_code(char code, bool accept_degenerated)
Normalize a nucleic acid code.
static const std::map< char, utils::Color > amino_acid_colors_map
bool nucleic_acid_code_containment(char a, char b, bool undetermined_matches_all)
Compare two nucleic acid codes and check if they are equal, taking degenerated/ambiguous characters into account.
std::string nucleic_acid_ambiguities(char code)
Return the possible ambiguous nucleic acid codes for a given code char.
std::string amino_acid_codes_undetermined()
Return all undetermined amino acid codes. Those are "X*-?".
static const std::unordered_map< std::string, char > nucleic_acid_ambiguity_string_map
std::map< char, utils::Color > nucleic_acid_colors()
Return a map of Colors for each nucleic acid code.
char nucleic_acid_ambiguity_code(std::string codes)
Return the nucleic acid code that represents all given codes.