A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
codes.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
36 #include <algorithm>
37 #include <cassert>
38 #include <cctype>
39 #include <stdexcept>
40 #include <unordered_map>
41 
42 namespace genesis {
43 namespace sequence {
44 
45 // =================================================================================================
46 // Name Lists
47 // =================================================================================================
48 
49 static const std::unordered_map<char, std::string> nucleic_acid_code_to_name = {
50  { 'A', "Adenine" },
51  { 'C', "Cytosine" },
52  { 'G', "Guanine" },
53  { 'T', "Thymine" },
54  { 'U', "Uracil" },
55 
56  { 'W', "Weak" },
57  { 'S', "Strong" },
58  { 'M', "aMino" },
59  { 'K', "Keto" },
60  { 'R', "puRine" },
61  { 'Y', "pYrimidine" },
62 
63  { 'B', "not A" },
64  { 'D', "not C" },
65  { 'H', "not G" },
66  { 'V', "not T" },
67 
68  { 'N', "any" },
69  { 'O', "omitted" },
70  { 'X', "masked" },
71  { '.', "gap" },
72  { '-', "gap" },
73  { '?', "gap" }
74 };
75 
76 static const std::unordered_map<char, std::string> amino_acid_code_to_name = {
77  { 'A', "Alanine" },
78  { 'B', "Aspartic acid or Asparagine" },
79  { 'C', "Cysteine" },
80  { 'D', "Aspartic acid" },
81  { 'E', "Glutamic acid" },
82  { 'F', "Phenylalanine" },
83  { 'G', "Glycine" },
84  { 'H', "Histidine" },
85  { 'I', "Isoleucine" },
86  { 'J', "Leucine or Isoleucine" },
87  { 'K', "Lysine" },
88  { 'L', "Leucine" },
89  { 'M', "Methionine" },
90  { 'N', "Asparagine" },
91  { 'O', "Pyrrolysine" },
92  { 'P', "Proline" },
93  { 'Q', "Glutamine" },
94  { 'R', "Arginine" },
95  { 'S', "Serine" },
96  { 'T', "Threonine" },
97  { 'U', "Selenocysteine" },
98  { 'V', "Valine" },
99  { 'W', "Tryptophan" },
100  { 'Y', "Tyrosine" },
101  { 'Z', "Glutamic acid or Glutamine" },
102 
103  { 'X', "any" },
104  { '*', "translation stop" },
105  { '-', "gap" },
106  { '?', "gap" }
107 };
108 
109 // =================================================================================================
110 // Color Lists
111 // =================================================================================================
112 
113 static const std::map<char, std::string> nucleic_acid_text_colors_map = {
114  { 'A', "Red" },
115  { 'C', "Green" },
116  { 'G', "Yellow" },
117  { 'T', "Blue" },
118  { 'U', "Blue" },
119 
120  { 'W', "DarkGray" },
121  { 'S', "DarkGray" },
122  { 'M', "DarkGray" },
123  { 'K', "DarkGray" },
124  { 'R', "DarkGray" },
125  { 'Y', "DarkGray" },
126 
127  { 'B', "DarkGray" },
128  { 'D', "DarkGray" },
129  { 'H', "DarkGray" },
130  { 'V', "DarkGray" },
131 
132  { 'N', "DarkGray" },
133  { 'O', "DarkGray" },
134  { 'X', "DarkGray" },
135  { '.', "DarkGray" },
136  { '-', "DarkGray" },
137  { '?', "DarkGray" }
138 };
139 
140 static const std::map<char, std::string> amino_acid_text_colors_map = {
141  { 'A', "Blue" },
142  { 'B', "DarkGray" },
143  { 'C', "LightMagenta" },
144  { 'D', "Magenta" },
145  { 'E', "Magenta" },
146  { 'F', "Blue" },
147  { 'G', "LightRed" },
148  { 'H', "Cyan" },
149  { 'I', "Blue" },
150  { 'J', "DarkGray" },
151  { 'K', "Red" },
152  { 'L', "Blue" },
153  { 'M', "Blue" },
154  { 'N', "Green" },
155  { 'O', "DarkGray" },
156  { 'P', "Yellow" },
157  { 'Q', "Green" },
158  { 'R', "Red" },
159  { 'S', "Green" },
160  { 'T', "Green" },
161  { 'U', "DarkGray" },
162  { 'V', "Blue" },
163  { 'W', "Blue" },
164  { 'Y', "Cyan" },
165  { 'Z', "DarkGray" },
166 
167  { 'X', "DarkGray" },
168  { '*', "DarkGray" },
169  { '-', "DarkGray" },
170  { '?', "DarkGray" }
171 };
172 
173 static const std::map<char, utils::Color> nucleic_acid_colors_map = {
174  { 'A', { 1.0, 0.0, 0.0 } },
175  { 'C', { 0.0, 1.0, 0.0 } },
176  { 'G', { 1.0, 1.0, 0.0 } },
177  { 'T', { 0.0, 0.0, 1.0 } },
178  { 'U', { 0.0, 0.0, 1.0 } },
179 
180  { 'W', { 0.376, 0.376, 0.376 } },
181  { 'S', { 0.376, 0.376, 0.376 } },
182  { 'M', { 0.376, 0.376, 0.376 } },
183  { 'K', { 0.376, 0.376, 0.376 } },
184  { 'R', { 0.376, 0.376, 0.376 } },
185  { 'Y', { 0.376, 0.376, 0.376 } },
186 
187  { 'B', { 0.5, 0.5, 0.5 } },
188  { 'D', { 0.5, 0.5, 0.5 } },
189  { 'H', { 0.5, 0.5, 0.5 } },
190  { 'V', { 0.5, 0.5, 0.5 } },
191 
192  { 'N', { 0.5, 0.5, 0.5 } },
193  { 'O', { 0.5, 0.5, 0.5 } },
194  { 'X', { 0.5, 0.5, 0.5 } },
195  { '.', { 0.5, 0.5, 0.5 } },
196  { '-', { 0.5, 0.5, 0.5 } },
197  { '?', { 0.5, 0.5, 0.5 } }
198 };
199 
200 static const std::map<char, utils::Color> amino_acid_colors_map = {
201  { 'A', { 0.098, 0.500, 1.000 } },
202  { 'B', { 0.376, 0.376, 0.376 } },
203  { 'C', { 0.902, 0.500, 0.500 } },
204  { 'D', { 0.800, 0.302, 0.800 } },
205  { 'E', { 0.800, 0.302, 0.800 } },
206  { 'F', { 0.098, 0.500, 1.000 } },
207  { 'G', { 0.902, 0.600, 0.302 } },
208  { 'H', { 0.098, 0.702, 0.702 } },
209  { 'I', { 0.098, 0.500, 1.000 } },
210  { 'J', { 0.376, 0.376, 0.376 } },
211  { 'K', { 0.902, 0.200, 0.098 } },
212  { 'L', { 0.098, 0.500, 1.000 } },
213  { 'M', { 0.098, 0.500, 1.000 } },
214  { 'N', { 0.098, 0.800, 0.098 } },
215  { 'O', { 0.376, 0.376, 0.376 } },
216  { 'P', { 0.800, 0.800, 0.000 } },
217  { 'Q', { 0.098, 0.800, 0.098 } },
218  { 'R', { 0.902, 0.200, 0.098 } },
219  { 'S', { 0.098, 0.800, 0.098 } },
220  { 'T', { 0.098, 0.800, 0.098 } },
221  { 'U', { 0.376, 0.376, 0.376 } },
222  { 'V', { 0.098, 0.500, 1.000 } },
223  { 'W', { 0.098, 0.500, 1.000 } },
224  { 'Y', { 0.098, 0.702, 0.702 } },
225  { 'Z', { 0.376, 0.376, 0.376 } },
226 
227  { 'X', { 0.5, 0.5, 0.5 } },
228  { '*', { 0.5, 0.5, 0.5 } },
229  { '-', { 0.5, 0.5, 0.5 } },
230  { '?', { 0.5, 0.5, 0.5 } }
231 };
232 
233 // =================================================================================================
234 // Ambiguity Lists
235 // =================================================================================================
236 
237 static const std::unordered_map<char, std::string> nucleic_acid_ambiguity_char_map = {
238  { 'A', "A" },
239  { 'C', "C" },
240  { 'G', "G" },
241  { 'T', "T" },
242  { 'U', "T" },
243 
244  { 'W', "AT" },
245  { 'S', "CG" },
246  { 'M', "AC" },
247  { 'K', "GT" },
248  { 'R', "AG" },
249  { 'Y', "CT" },
250 
251  { 'B', "CGT" },
252  { 'D', "AGT" },
253  { 'H', "ACT" },
254  { 'V', "ACG" },
255 
256  { 'N', "ACGT" },
257  { 'O', "-" },
258  { 'X', "-" },
259  { '.', "-" },
260  { '-', "-" },
261  { '?', "-" }
262 };
263 
264 static const std::unordered_map< std::string, char > nucleic_acid_ambiguity_string_map = {
265  { "A", 'A' },
266  { "C", 'C' },
267  { "G", 'G' },
268  { "T", 'T' },
269 
270  { "AT", 'W' },
271  { "CG", 'S' },
272  { "AC", 'M' },
273  { "GT", 'K' },
274  { "AG", 'R' },
275  { "CT", 'Y' },
276 
277  { "CGT", 'B' },
278  { "AGT", 'D' },
279  { "ACT", 'H' },
280  { "ACG", 'V' },
281 
282  { "ACGT", 'N' },
283  { "-", '-' }
284 };
285 
286 // =================================================================================================
287 // Codes
288 // =================================================================================================
289 
290 // ---------------------------------------------------------------------
291 // Nucleic Acids
292 // ---------------------------------------------------------------------
293 
295 {
296  return "ACGT";
297 }
298 
300 {
301  return "WSMKRYBDHV";
302 }
303 
305 {
306  return "NOX.-?";
307 }
308 
310 {
311  return nucleic_acid_codes_plain()
314 }
315 
316 // ---------------------------------------------------------------------
317 // Amino Acids
318 // ---------------------------------------------------------------------
319 
321 {
322  return "ACDEFGHIKLMNOPQRSTUVWY";
323 }
324 
326 {
327  return "BJZ";
328 }
329 
331 {
332  return "X*-?";
333 }
334 
335 std::string amino_acid_codes_all()
336 {
337  return amino_acid_codes_plain()
340 }
341 
342 // ---------------------------------------------------------------------
343 // Misc
344 // ---------------------------------------------------------------------
345 
346 std::string normalize_code_alphabet( std::string const& alphabet )
347 {
348  // Uppercase, sort, uniq the alphabet.
349  auto normalized = utils::to_upper_ascii( alphabet );
350  std::sort( normalized.begin(), normalized.end() );
351  normalized.erase( std::unique( normalized.begin(), normalized.end() ), normalized.end() );
352  return normalized;
353 }
354 
355 char normalize_nucleic_acid_code( char code, bool accept_degenerated )
356 {
357  switch( code ) {
358  case 'u':
359  case 'U':
360  case 't':
361  case 'T':
362  return 'T';
363  case 'a':
364  case 'A':
365  return 'A';
366  case 'c':
367  case 'C':
368  return 'C';
369  case 'g':
370  case 'G':
371  return 'G';
372  case 'w':
373  case 'W':
374  case 's':
375  case 'S':
376  case 'm':
377  case 'M':
378  case 'k':
379  case 'K':
380  case 'r':
381  case 'R':
382  case 'y':
383  case 'Y':
384  case 'b':
385  case 'B':
386  case 'd':
387  case 'D':
388  case 'h':
389  case 'H':
390  case 'v':
391  case 'V':
392  if( accept_degenerated ) {
393  return utils::to_upper_ascii( code );
394  } else {
395  throw std::invalid_argument(
396  "Degenerated nucleic acid code not accepted: " + std::string( 1, code )
397  );
398  }
399  case 'n':
400  case 'N':
401  case 'o':
402  case 'O':
403  case 'x':
404  case 'X':
405  case '.':
406  case '-':
407  case '?':
408  return '-';
409  default:
410  throw std::invalid_argument( "Not a nucleic acid code: " + std::string( 1, code ) );
411  }
412 }
413 
414 char normalize_amino_acid_code( char code, bool accept_degenerated )
415 {
416  switch( code ) {
417  case 'A':
418  case 'C':
419  case 'D':
420  case 'E':
421  case 'F':
422  case 'G':
423  case 'H':
424  case 'I':
425  case 'K':
426  case 'L':
427  case 'M':
428  case 'N':
429  case 'O':
430  case 'P':
431  case 'Q':
432  case 'R':
433  case 'S':
434  case 'T':
435  case 'U':
436  case 'V':
437  case 'W':
438  case 'Y':
439  return code;
440  case 'a':
441  case 'c':
442  case 'd':
443  case 'e':
444  case 'f':
445  case 'g':
446  case 'h':
447  case 'i':
448  case 'k':
449  case 'l':
450  case 'm':
451  case 'n':
452  case 'o':
453  case 'p':
454  case 'q':
455  case 'r':
456  case 's':
457  case 't':
458  case 'u':
459  case 'v':
460  case 'w':
461  case 'y':
462  return utils::to_upper_ascii( code );
463  case 'B':
464  case 'J':
465  case 'Z':
466  case 'b':
467  case 'j':
468  case 'z':
469  if( accept_degenerated ) {
470  return utils::to_upper_ascii( code );
471  } else {
472  throw std::invalid_argument(
473  "Degenerated amino acid code not accepted: " + std::string( 1, code )
474  );
475  }
476  case 'X':
477  case 'x':
478  case '*':
479  case '-':
480  case '?':
481  return '-';
482  default:
483  throw std::invalid_argument( "Not an amino acid code: " + std::string( 1, code ) );
484  }
485 }
486 
487 std::string reverse_complement( std::string const& sequence, bool accept_degenerated )
488 {
489  // Dummy result string.
490  auto result = std::string( sequence.size(), '-' );
491 
492  // Get rev comp char. We only need to check upper case as we normalized before.
493  auto rev_comp = []( char c ){
494  switch( c ) {
495  case 'A':
496  return 'T';
497  case 'C':
498  return 'G';
499  case 'G':
500  return 'C';
501  case 'T':
502  return 'A';
503 
504  case 'W':
505  return 'W';
506  case 'S':
507  return 'S';
508  case 'M':
509  return 'K';
510  case 'K':
511  return 'M';
512  case 'R':
513  return 'Y';
514  case 'Y':
515  return 'R';
516 
517  case 'B':
518  return 'V';
519  case 'D':
520  return 'H';
521  case 'H':
522  return 'D';
523  case 'V':
524  return 'B';
525 
526  default:
527  // We already checked for invalid chars in the normalize function.
528  // Just do this to be safe.
529  assert( false );
530  throw std::invalid_argument( "Not a nucleic acid code: " + std::string( 1, c ) );
531  }
532  };
533 
534  // Stupid and simple.
535  for( size_t i = 0; i < sequence.size(); ++i ) {
536  char c = sequence[i];
537 
538  // Slighly hacky: the normalize function turns 'N' into '-'.
539  // We don't want that here, so we have to treat that special case.
540  if( c == 'n' || c == 'N' ) {
541  if( accept_degenerated ) {
542  result[ sequence.size() - i - 1 ] = 'N';
543  continue;
544  } else {
545  throw std::invalid_argument(
546  "Degenerated nucleic acid code not accepted: " + std::string( 1, c )
547  );
548  }
549  }
550 
551  // First normalize, then reverse. Slighly inefficition, but saves code duplication.
552  c = normalize_nucleic_acid_code( c, accept_degenerated );
553  result[ sequence.size() - i - 1 ] = rev_comp( c );
554  }
555  return result;
556 }
557 
558 // =================================================================================================
559 // Color Codes
560 // =================================================================================================
561 
562 std::map<char, std::string> nucleic_acid_text_colors()
563 {
565 }
566 
567 std::map<char, std::string> amino_acid_text_colors()
568 {
570 }
571 
572 std::map<char, utils::Color> nucleic_acid_colors()
573 {
575 }
576 
577 std::map<char, utils::Color> amino_acid_colors()
578 {
579  return amino_acid_colors_map;
580 }
581 
582 // =================================================================================================
583 // Translate Codes
584 // =================================================================================================
585 
586 std::string nucleic_acid_name( char code )
587 {
588  auto ucode = toupper(code);
589  if( nucleic_acid_code_to_name.count( ucode ) == 0 ) {
590  throw std::out_of_range( "Invalid nucleic acid code '" + std::string( 1, code ) + "'." );
591  }
592  return nucleic_acid_code_to_name.at( ucode );
593 }
594 
595 std::string amino_acid_name( char code )
596 {
597  auto ucode = toupper(code);
598  if( amino_acid_code_to_name.count( ucode ) == 0 ) {
599  throw std::out_of_range( "Invalid amino acid code '" + std::string( 1, code ) + "'." );
600  }
601  return amino_acid_code_to_name.at( ucode );
602 }
603 
604 std::string nucleic_acid_ambiguities( char code )
605 {
606  auto ucode = toupper(code);
607  if( nucleic_acid_code_to_name.count( ucode ) == 0 ) {
608  throw std::out_of_range( "Invalid nucleic acid code '" + std::string( 1, code ) + "'." );
609  }
610  return nucleic_acid_ambiguity_char_map.at( ucode );
611 }
612 
613 char nucleic_acid_ambiguity_code( std::string codes )
614 {
615  // Uppercase, sort, uniq the codes.
616  auto tmp = utils::to_upper_ascii( codes );
617  std::sort( tmp.begin(), tmp.end() );
618  tmp.erase( std::unique( tmp.begin(), tmp.end() ), tmp.end() );
619 
620  if( nucleic_acid_ambiguity_string_map.count( tmp ) == 0 ) {
621  throw std::out_of_range( "Invalid nucleic acid codes '" + codes + "'." );
622  }
623  return nucleic_acid_ambiguity_string_map.at( tmp );
624 }
625 
626 } // namespace sequence
627 } // namespace genesis
std::string amino_acid_codes_all()
Return all valid amino acid codes. Those are "ACDEFGHIKLMNOPQRSTUVWYBJZX*-?".
Definition: codes.cpp:335
static const std::map< char, std::string > nucleic_acid_text_colors_map
Definition: codes.cpp:113
std::string amino_acid_codes_degenerated()
Return all degenerated amino acid codes. Those are "BJZ".
Definition: codes.cpp:325
std::string amino_acid_codes_plain()
Return all plain amino acid codes. Those are "ACDEFGHIKLMNOPQRSTUVWY".
Definition: codes.cpp:320
std::string nucleic_acid_codes_all()
Return all valid nucleic acid codes. Those are "ACGTUWSMKRYBDHVNOX.-?".
Definition: codes.cpp:309
std::string nucleic_acid_name(char code)
Get the name of a nucleic acid given its IUPAC code.
Definition: codes.cpp:586
static const std::unordered_map< char, std::string > nucleic_acid_code_to_name
Definition: codes.cpp:49
std::string nucleic_acid_codes_plain()
Return all plain nucleic acid codes. Those are "ACGTU".
Definition: codes.cpp:294
Header of Color class.
std::string nucleic_acid_codes_degenerated()
Return all degenerated nucleic acid codes. Those are "WSMKRYBDHV".
Definition: codes.cpp:299
std::string nucleic_acid_codes_undetermined()
Return all undetermined nucleic acid codes. Those are "NOX.-?".
Definition: codes.cpp:304
std::string reverse_complement(std::string const &sequence, bool accept_degenerated)
Get the reverse complement of a nucleic acid sequence.
Definition: codes.cpp:487
std::map< char, std::string > nucleic_acid_text_colors()
Return a map of text colors for each nucleic acid code.
Definition: codes.cpp:562
std::string amino_acid_name(char code)
Get the name of a amino acid given its IUPAC code.
Definition: codes.cpp:595
static const std::unordered_map< char, std::string > amino_acid_code_to_name
Definition: codes.cpp:76
char normalize_amino_acid_code(char code, bool accept_degenerated)
Normalize an amino acid code.
Definition: codes.cpp:414
static const std::map< char, utils::Color > nucleic_acid_colors_map
Definition: codes.cpp:173
Provides some commonly used string utility functions.
std::string normalize_code_alphabet(std::string const &alphabet)
Normalize an alphabet set of Sequence codes, i.e., make them upper case, sort them, and remove duplicates.
Definition: codes.cpp:346
char to_upper_ascii(char c)
Return the upper case of a given char, ascii-only.
Definition: char.hpp:89
static const std::unordered_map< char, std::string > nucleic_acid_ambiguity_char_map
Definition: codes.cpp:237
std::map< char, utils::Color > amino_acid_colors()
Return a map of Colors for each amino acid code.
Definition: codes.cpp:577
static const std::map< char, std::string > amino_acid_text_colors_map
Definition: codes.cpp:140
std::map< char, std::string > amino_acid_text_colors()
Return a map of text colors for each amino acid code.
Definition: codes.cpp:567
char normalize_nucleic_acid_code(char code, bool accept_degenerated)
Normalize a nucleic acide code.
Definition: codes.cpp:355
static const std::map< char, utils::Color > amino_acid_colors_map
Definition: codes.cpp:200
std::string nucleic_acid_ambiguities(char code)
Return the possible ambiguous nucleic acid codes for a given code char.
Definition: codes.cpp:604
std::string amino_acid_codes_undetermined()
Return all undetermined amino acid codes. Those are "X*-?".
Definition: codes.cpp:330
static const std::unordered_map< std::string, char > nucleic_acid_ambiguity_string_map
Definition: codes.cpp:264
std::map< char, utils::Color > nucleic_acid_colors()
Return a map of Colors for each nucleic acid code.
Definition: codes.cpp:572
char nucleic_acid_ambiguity_code(std::string codes)
Return the nucleic acid code that represents all given codes.
Definition: codes.cpp:613