A toolkit for working with phylogenetic data.
v0.24.0
codes.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
37 #include <algorithm>
38 #include <cassert>
39 #include <cctype>
40 #include <stdexcept>
41 #include <unordered_map>
42 
43 namespace genesis {
44 namespace sequence {
45 
46 // =================================================================================================
47 // Name Lists
48 // =================================================================================================
49 
50 static const std::unordered_map<char, std::string> nucleic_acid_code_to_name = {
51  { 'A', "Adenine" },
52  { 'C', "Cytosine" },
53  { 'G', "Guanine" },
54  { 'T', "Thymine" },
55  { 'U', "Uracil" },
56 
57  { 'W', "Weak" },
58  { 'S', "Strong" },
59  { 'M', "aMino" },
60  { 'K', "Keto" },
61  { 'R', "puRine" },
62  { 'Y', "pYrimidine" },
63 
64  { 'B', "not A" },
65  { 'D', "not C" },
66  { 'H', "not G" },
67  { 'V', "not T" },
68 
69  { 'N', "any" },
70  { 'O', "omitted" },
71  { 'X', "masked" },
72  { '.', "gap" },
73  { '-', "gap" },
74  { '?', "gap" }
75 };
76 
77 static const std::unordered_map<char, std::string> amino_acid_code_to_name = {
78  { 'A', "Alanine" },
79  { 'B', "Aspartic acid or Asparagine" },
80  { 'C', "Cysteine" },
81  { 'D', "Aspartic acid" },
82  { 'E', "Glutamic acid" },
83  { 'F', "Phenylalanine" },
84  { 'G', "Glycine" },
85  { 'H', "Histidine" },
86  { 'I', "Isoleucine" },
87  { 'J', "Leucine or Isoleucine" },
88  { 'K', "Lysine" },
89  { 'L', "Leucine" },
90  { 'M', "Methionine" },
91  { 'N', "Asparagine" },
92  { 'O', "Pyrrolysine" },
93  { 'P', "Proline" },
94  { 'Q', "Glutamine" },
95  { 'R', "Arginine" },
96  { 'S', "Serine" },
97  { 'T', "Threonine" },
98  { 'U', "Selenocysteine" },
99  { 'V', "Valine" },
100  { 'W', "Tryptophan" },
101  { 'Y', "Tyrosine" },
102  { 'Z', "Glutamic acid or Glutamine" },
103 
104  { 'X', "any" },
105  { '*', "translation stop" },
106  { '-', "gap" },
107  { '?', "gap" }
108 };
109 
110 // =================================================================================================
111 // Color Lists
112 // =================================================================================================
113 
114 static const std::map<char, std::string> nucleic_acid_text_colors_map = {
115  { 'A', "Red" },
116  { 'C', "Green" },
117  { 'G', "Yellow" },
118  { 'T', "Blue" },
119  { 'U', "Blue" },
120 
121  { 'W', "DarkGray" },
122  { 'S', "DarkGray" },
123  { 'M', "DarkGray" },
124  { 'K', "DarkGray" },
125  { 'R', "DarkGray" },
126  { 'Y', "DarkGray" },
127 
128  { 'B', "DarkGray" },
129  { 'D', "DarkGray" },
130  { 'H', "DarkGray" },
131  { 'V', "DarkGray" },
132 
133  { 'N', "DarkGray" },
134  { 'O', "DarkGray" },
135  { 'X', "DarkGray" },
136  { '.', "DarkGray" },
137  { '-', "DarkGray" },
138  { '?', "DarkGray" }
139 };
140 
141 static const std::map<char, std::string> amino_acid_text_colors_map = {
142  { 'A', "Blue" },
143  { 'B', "DarkGray" },
144  { 'C', "LightMagenta" },
145  { 'D', "Magenta" },
146  { 'E', "Magenta" },
147  { 'F', "Blue" },
148  { 'G', "LightRed" },
149  { 'H', "Cyan" },
150  { 'I', "Blue" },
151  { 'J', "DarkGray" },
152  { 'K', "Red" },
153  { 'L', "Blue" },
154  { 'M', "Blue" },
155  { 'N', "Green" },
156  { 'O', "DarkGray" },
157  { 'P', "Yellow" },
158  { 'Q', "Green" },
159  { 'R', "Red" },
160  { 'S', "Green" },
161  { 'T', "Green" },
162  { 'U', "DarkGray" },
163  { 'V', "Blue" },
164  { 'W', "Blue" },
165  { 'Y', "Cyan" },
166  { 'Z', "DarkGray" },
167 
168  { 'X', "DarkGray" },
169  { '*', "DarkGray" },
170  { '-', "DarkGray" },
171  { '?', "DarkGray" }
172 };
173 
174 static const std::map<char, utils::Color> nucleic_acid_colors_map = {
175  { 'A', { 1.0, 0.0, 0.0 } },
176  { 'C', { 0.0, 1.0, 0.0 } },
177  { 'G', { 1.0, 1.0, 0.0 } },
178  { 'T', { 0.0, 0.0, 1.0 } },
179  { 'U', { 0.0, 0.0, 1.0 } },
180 
181  { 'W', { 0.376, 0.376, 0.376 } },
182  { 'S', { 0.376, 0.376, 0.376 } },
183  { 'M', { 0.376, 0.376, 0.376 } },
184  { 'K', { 0.376, 0.376, 0.376 } },
185  { 'R', { 0.376, 0.376, 0.376 } },
186  { 'Y', { 0.376, 0.376, 0.376 } },
187 
188  { 'B', { 0.5, 0.5, 0.5 } },
189  { 'D', { 0.5, 0.5, 0.5 } },
190  { 'H', { 0.5, 0.5, 0.5 } },
191  { 'V', { 0.5, 0.5, 0.5 } },
192 
193  { 'N', { 0.5, 0.5, 0.5 } },
194  { 'O', { 0.5, 0.5, 0.5 } },
195  { 'X', { 0.5, 0.5, 0.5 } },
196  { '.', { 0.5, 0.5, 0.5 } },
197  { '-', { 0.5, 0.5, 0.5 } },
198  { '?', { 0.5, 0.5, 0.5 } }
199 };
200 
201 static const std::map<char, utils::Color> amino_acid_colors_map = {
202  { 'A', { 0.098, 0.500, 1.000 } },
203  { 'B', { 0.376, 0.376, 0.376 } },
204  { 'C', { 0.902, 0.500, 0.500 } },
205  { 'D', { 0.800, 0.302, 0.800 } },
206  { 'E', { 0.800, 0.302, 0.800 } },
207  { 'F', { 0.098, 0.500, 1.000 } },
208  { 'G', { 0.902, 0.600, 0.302 } },
209  { 'H', { 0.098, 0.702, 0.702 } },
210  { 'I', { 0.098, 0.500, 1.000 } },
211  { 'J', { 0.376, 0.376, 0.376 } },
212  { 'K', { 0.902, 0.200, 0.098 } },
213  { 'L', { 0.098, 0.500, 1.000 } },
214  { 'M', { 0.098, 0.500, 1.000 } },
215  { 'N', { 0.098, 0.800, 0.098 } },
216  { 'O', { 0.376, 0.376, 0.376 } },
217  { 'P', { 0.800, 0.800, 0.000 } },
218  { 'Q', { 0.098, 0.800, 0.098 } },
219  { 'R', { 0.902, 0.200, 0.098 } },
220  { 'S', { 0.098, 0.800, 0.098 } },
221  { 'T', { 0.098, 0.800, 0.098 } },
222  { 'U', { 0.376, 0.376, 0.376 } },
223  { 'V', { 0.098, 0.500, 1.000 } },
224  { 'W', { 0.098, 0.500, 1.000 } },
225  { 'Y', { 0.098, 0.702, 0.702 } },
226  { 'Z', { 0.376, 0.376, 0.376 } },
227 
228  { 'X', { 0.5, 0.5, 0.5 } },
229  { '*', { 0.5, 0.5, 0.5 } },
230  { '-', { 0.5, 0.5, 0.5 } },
231  { '?', { 0.5, 0.5, 0.5 } }
232 };
233 
234 // =================================================================================================
235 // Ambiguity Lists
236 // =================================================================================================
237 
238 static const std::unordered_map<char, std::string> nucleic_acid_ambiguity_char_map = {
239  { 'A', "A" },
240  { 'C', "C" },
241  { 'G', "G" },
242  { 'T', "T" },
243  { 'U', "T" },
244 
245  { 'W', "AT" },
246  { 'S', "CG" },
247  { 'M', "AC" },
248  { 'K', "GT" },
249  { 'R', "AG" },
250  { 'Y', "CT" },
251 
252  { 'B', "CGT" },
253  { 'D', "AGT" },
254  { 'H', "ACT" },
255  { 'V', "ACG" },
256 
257  { 'N', "ACGT" },
258  { 'O', "-" },
259  { 'X', "-" },
260  { '.', "-" },
261  { '-', "-" },
262  { '?', "-" }
263 };
264 
265 static const std::unordered_map< std::string, char > nucleic_acid_ambiguity_string_map = {
266  { "A", 'A' },
267  { "C", 'C' },
268  { "G", 'G' },
269  { "T", 'T' },
270 
271  { "AT", 'W' },
272  { "CG", 'S' },
273  { "AC", 'M' },
274  { "GT", 'K' },
275  { "AG", 'R' },
276  { "CT", 'Y' },
277 
278  { "CGT", 'B' },
279  { "AGT", 'D' },
280  { "ACT", 'H' },
281  { "ACG", 'V' },
282 
283  { "ACGT", 'N' },
284  { "-", '-' }
285 };
286 
287 // =================================================================================================
288 // Codes
289 // =================================================================================================
290 
291 // ---------------------------------------------------------------------
292 // Nucleic Acids
293 // ---------------------------------------------------------------------
294 
296 {
297  return "ACGT";
298 }
299 
301 {
302  return "WSMKRYBDHV";
303 }
304 
306 {
307  return "NOX.-?";
308 }
309 
311 {
312  return nucleic_acid_codes_plain()
315 }
316 
317 // ---------------------------------------------------------------------
318 // Amino Acids
319 // ---------------------------------------------------------------------
320 
322 {
323  return "ACDEFGHIKLMNOPQRSTUVWY";
324 }
325 
327 {
328  return "BJZ";
329 }
330 
332 {
333  return "X*-?";
334 }
335 
336 std::string amino_acid_codes_all()
337 {
338  return amino_acid_codes_plain()
341 }
342 
343 // ---------------------------------------------------------------------
344 // Misc
345 // ---------------------------------------------------------------------
346 
347 std::string normalize_code_alphabet( std::string const& alphabet )
348 {
349  // Uppercase, sort, uniq the alphabet.
350  auto normalized = utils::to_upper( alphabet );
351  std::sort( normalized.begin(), normalized.end() );
352  normalized.erase( std::unique( normalized.begin(), normalized.end() ), normalized.end() );
353  return normalized;
354 }
355 
356 char normalize_nucleic_acid_code( char code, bool accept_degenerated )
357 {
358  switch( code ) {
359  case 'u':
360  case 'U':
361  case 't':
362  case 'T':
363  return 'T';
364  case 'a':
365  case 'A':
366  return 'A';
367  case 'c':
368  case 'C':
369  return 'C';
370  case 'g':
371  case 'G':
372  return 'G';
373  case 'w':
374  case 'W':
375  case 's':
376  case 'S':
377  case 'm':
378  case 'M':
379  case 'k':
380  case 'K':
381  case 'r':
382  case 'R':
383  case 'y':
384  case 'Y':
385  case 'b':
386  case 'B':
387  case 'd':
388  case 'D':
389  case 'h':
390  case 'H':
391  case 'v':
392  case 'V':
393  if( accept_degenerated ) {
394  return utils::to_upper( code );
395  } else {
396  throw std::invalid_argument(
397  "Degenerated nucleic acid code not accepted: " + std::string( 1, code )
398  );
399  }
400  case 'n':
401  case 'N':
402  case 'o':
403  case 'O':
404  case 'x':
405  case 'X':
406  case '.':
407  case '-':
408  case '?':
409  return '-';
410  default:
411  throw std::invalid_argument( "Not a nucleic acid code: " + std::string( 1, code ) );
412  }
413 }
414 
415 char normalize_amino_acid_code( char code, bool accept_degenerated )
416 {
417  switch( code ) {
418  case 'A':
419  case 'C':
420  case 'D':
421  case 'E':
422  case 'F':
423  case 'G':
424  case 'H':
425  case 'I':
426  case 'K':
427  case 'L':
428  case 'M':
429  case 'N':
430  case 'O':
431  case 'P':
432  case 'Q':
433  case 'R':
434  case 'S':
435  case 'T':
436  case 'U':
437  case 'V':
438  case 'W':
439  case 'Y':
440  return code;
441  case 'a':
442  case 'c':
443  case 'd':
444  case 'e':
445  case 'f':
446  case 'g':
447  case 'h':
448  case 'i':
449  case 'k':
450  case 'l':
451  case 'm':
452  case 'n':
453  case 'o':
454  case 'p':
455  case 'q':
456  case 'r':
457  case 's':
458  case 't':
459  case 'u':
460  case 'v':
461  case 'w':
462  case 'y':
463  return utils::to_upper( code );
464  case 'B':
465  case 'J':
466  case 'Z':
467  case 'b':
468  case 'j':
469  case 'z':
470  if( accept_degenerated ) {
471  return utils::to_upper( code );
472  } else {
473  throw std::invalid_argument(
474  "Degenerated amino acid code not accepted: " + std::string( 1, code )
475  );
476  }
477  case 'X':
478  case 'x':
479  case '*':
480  case '-':
481  case '?':
482  return '-';
483  default:
484  throw std::invalid_argument( "Not an amino acid code: " + std::string( 1, code ) );
485  }
486 }
487 
488 std::string reverse_complement( std::string const& sequence, bool accept_degenerated )
489 {
490  // Dummy result string.
491  auto result = std::string( sequence.size(), '-' );
492 
493  // Get rev comp char. We only need to check upper case as we normalized before.
494  auto rev_comp = []( char c ){
495  switch( c ) {
496  case 'A':
497  return 'T';
498  case 'C':
499  return 'G';
500  case 'G':
501  return 'C';
502  case 'T':
503  return 'A';
504 
505  case 'W':
506  return 'W';
507  case 'S':
508  return 'S';
509  case 'M':
510  return 'K';
511  case 'K':
512  return 'M';
513  case 'R':
514  return 'Y';
515  case 'Y':
516  return 'R';
517 
518  case 'B':
519  return 'V';
520  case 'D':
521  return 'H';
522  case 'H':
523  return 'D';
524  case 'V':
525  return 'B';
526 
527  default:
528  // We already checked for invalid chars in the normalize function.
529  // Just do this to be safe.
530  assert( false );
531  throw std::invalid_argument( "Not a nucleic acid code: " + std::string( 1, c ) );
532  }
533  };
534 
535  // Stupid and simple.
536  for( size_t i = 0; i < sequence.size(); ++i ) {
537  char c = sequence[i];
538 
539  // Slighly hacky: the normalize function turns 'N' into '-'.
540  // We don't want that here, so we have to treat that special case.
541  if( c == 'n' || c == 'N' ) {
542  if( accept_degenerated ) {
543  result[ sequence.size() - i - 1 ] = 'N';
544  continue;
545  } else {
546  throw std::invalid_argument(
547  "Degenerated nucleic acid code not accepted: " + std::string( 1, c )
548  );
549  }
550  }
551 
552  // First normalize, then reverse. Slighly inefficition, but saves code duplication.
553  c = normalize_nucleic_acid_code( c, accept_degenerated );
554  result[ sequence.size() - i - 1 ] = rev_comp( c );
555  }
556  return result;
557 }
558 
559 bool nucleic_acid_code_containment( char a, char b, bool undetermined_matches_all )
560 {
561  // This is slightly bad, because we are not actually encoding binary,
562  // but using hex here (for simplicity of writing...). Should still work as expected.
563  auto binary_code_ = [ undetermined_matches_all ]( char c ){
564  switch( c ) {
565  case 'A':
566  return 0x0001;
567  case 'C':
568  return 0x0010;
569  case 'G':
570  return 0x0100;
571  case 'T':
572  return 0x1000;
573 
574  case 'W':
575  return 0x1001;
576  case 'S':
577  return 0x0110;
578  case 'M':
579  return 0x0011;
580  case 'K':
581  return 0x1100;
582  case 'R':
583  return 0x0101;
584  case 'Y':
585  return 0x1010;
586 
587  case 'B':
588  return 0x1110;
589  case 'D':
590  return 0x1101;
591  case 'H':
592  return 0x1011;
593  case 'V':
594  return 0x0111;
595 
596  case '-': {
597  if( undetermined_matches_all ) {
598  return 0x1111;
599  } else {
600  return 0x0000;
601  }
602  }
603 
604  default:
605  // We already checked for invalid chars in the normalize function.
606  // Just do this to be safe.
607  assert( false );
608  throw std::invalid_argument( "Not a nucleic acid code: " + std::string( 1, c ) );
609  }
610  };
611 
612  auto const an = normalize_nucleic_acid_code( a, true );
613  auto const bn = normalize_nucleic_acid_code( b, true );
614 
615  auto const ab = binary_code_( an );
616  auto const bb = binary_code_( bn );
617 
618  return ( ab & bb ) > 0;
619 }
620 
621 // =================================================================================================
622 // Color Codes
623 // =================================================================================================
624 
625 std::map<char, std::string> nucleic_acid_text_colors()
626 {
628 }
629 
630 std::map<char, std::string> amino_acid_text_colors()
631 {
633 }
634 
635 std::map<char, utils::Color> nucleic_acid_colors()
636 {
638 }
639 
640 std::map<char, utils::Color> amino_acid_colors()
641 {
642  return amino_acid_colors_map;
643 }
644 
645 // =================================================================================================
646 // Translate Codes
647 // =================================================================================================
648 
649 std::string nucleic_acid_name( char code )
650 {
651  auto ucode = toupper(code);
652  if( nucleic_acid_code_to_name.count( ucode ) == 0 ) {
653  throw std::out_of_range( "Invalid nucleic acid code '" + std::string( 1, code ) + "'." );
654  }
655  return nucleic_acid_code_to_name.at( ucode );
656 }
657 
658 std::string amino_acid_name( char code )
659 {
660  auto ucode = toupper(code);
661  if( amino_acid_code_to_name.count( ucode ) == 0 ) {
662  throw std::out_of_range( "Invalid amino acid code '" + std::string( 1, code ) + "'." );
663  }
664  return amino_acid_code_to_name.at( ucode );
665 }
666 
667 std::string nucleic_acid_ambiguities( char code )
668 {
669  auto ucode = toupper(code);
670  if( nucleic_acid_code_to_name.count( ucode ) == 0 ) {
671  throw std::out_of_range( "Invalid nucleic acid code '" + std::string( 1, code ) + "'." );
672  }
673  return nucleic_acid_ambiguity_char_map.at( ucode );
674 }
675 
676 char nucleic_acid_ambiguity_code( std::string codes )
677 {
678  // Uppercase, sort, uniq the codes.
679  auto tmp = utils::to_upper( codes );
680  std::sort( tmp.begin(), tmp.end() );
681  tmp.erase( std::unique( tmp.begin(), tmp.end() ), tmp.end() );
682 
683  if( nucleic_acid_ambiguity_string_map.count( tmp ) == 0 ) {
684  throw std::out_of_range( "Invalid nucleic acid codes '" + codes + "'." );
685  }
686  return nucleic_acid_ambiguity_string_map.at( tmp );
687 }
688 
689 } // namespace sequence
690 } // namespace genesis
std::string amino_acid_codes_all()
Return all valid amino acid codes. Those are "ACDEFGHIKLMNOPQRSTUVWYBJZX*-?".
Definition: codes.cpp:336
static const std::map< char, std::string > nucleic_acid_text_colors_map
Definition: codes.cpp:114
std::string amino_acid_codes_degenerated()
Return all degenerated amino acid codes. Those are "BJZ".
Definition: codes.cpp:326
std::string amino_acid_codes_plain()
Return all plain amino acid codes. Those are "ACDEFGHIKLMNOPQRSTUVWY".
Definition: codes.cpp:321
std::string nucleic_acid_codes_all()
Return all valid nucleic acid codes. Those are "ACGTUWSMKRYBDHVNOX.-?".
Definition: codes.cpp:310
std::string nucleic_acid_name(char code)
Get the name of a nucleic acid given its IUPAC code.
Definition: codes.cpp:649
static const std::unordered_map< char, std::string > nucleic_acid_code_to_name
Definition: codes.cpp:50
std::string normalize_code_alphabet(std::string const &alphabet)
Normalize an alphabet set of Sequence codes, i.e., make them upper case, sort them, and remove duplicates.
Definition: codes.cpp:347
std::string nucleic_acid_codes_plain()
Return all plain nucleic acid codes. Those are "ACGTU".
Definition: codes.cpp:295
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
Header of Color class.
std::string nucleic_acid_codes_degenerated()
Return all degenerated nucleic acid codes. Those are "WSMKRYBDHV".
Definition: codes.cpp:300
std::string nucleic_acid_codes_undetermined()
Return all undetermined nucleic acid codes. Those are "NOX.-?".
Definition: codes.cpp:305
std::map< char, std::string > nucleic_acid_text_colors()
Return a map of text colors for each nucleic acid code.
Definition: codes.cpp:625
std::string reverse_complement(std::string const &sequence, bool accept_degenerated)
Get the reverse complement of a nucleic acid sequence.
Definition: codes.cpp:488
std::string amino_acid_name(char code)
Get the name of a amino acid given its IUPAC code.
Definition: codes.cpp:658
static const std::unordered_map< char, std::string > amino_acid_code_to_name
Definition: codes.cpp:77
char normalize_amino_acid_code(char code, bool accept_degenerated)
Normalize an amino acid code.
Definition: codes.cpp:415
static const std::map< char, utils::Color > nucleic_acid_colors_map
Definition: codes.cpp:174
Provides some commonly used string utility functions.
static const std::unordered_map< char, std::string > nucleic_acid_ambiguity_char_map
Definition: codes.cpp:238
std::map< char, utils::Color > amino_acid_colors()
Return a map of Colors for each amino acid code.
Definition: codes.cpp:640
static const std::map< char, std::string > amino_acid_text_colors_map
Definition: codes.cpp:141
std::map< char, std::string > amino_acid_text_colors()
Return a map of text colors for each amino acid code.
Definition: codes.cpp:630
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:227
char normalize_nucleic_acid_code(char code, bool accept_degenerated)
Normalize a nucleic acide code.
Definition: codes.cpp:356
static const std::map< char, utils::Color > amino_acid_colors_map
Definition: codes.cpp:201
bool nucleic_acid_code_containment(char a, char b, bool undetermined_matches_all)
Compare two nucleic acid codes and check if they are equal, taking degenerated/ambiguous characters i...
Definition: codes.cpp:559
std::string nucleic_acid_ambiguities(char code)
Return the possible ambiguous nucleic acid codes for a given code char.
Definition: codes.cpp:667
std::string amino_acid_codes_undetermined()
Return all undetermined amino acid codes. Those are "X*-?".
Definition: codes.cpp:331
static const std::unordered_map< std::string, char > nucleic_acid_ambiguity_string_map
Definition: codes.cpp:265
std::map< char, utils::Color > nucleic_acid_colors()
Return a map of Colors for each nucleic acid code.
Definition: codes.cpp:635
char nucleic_acid_ambiguity_code(std::string codes)
Return the nucleic acid code that represents all given codes.
Definition: codes.cpp:676