A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
codes.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
36 #include <algorithm>
37 #include <cctype>
38 #include <stdexcept>
39 #include <unordered_map>
40 
41 namespace genesis {
42 namespace sequence {
43 
44 // =================================================================================================
45 // Name Lists
46 // =================================================================================================
47 
// Lookup table from a nucleic acid code char to a descriptive name.
// Keys are upper case letters plus the symbols '.', '-' and '?'; lookups such as the one in
// nucleic_acid_name() convert the queried char to upper case before consulting this table.
// The three symbol entries, '.', '-' and '?', all share the name "gap".
48 static const std::unordered_map<char, std::string> nucleic_acid_code_to_name = {
49  { 'A', "Adenine" },
50  { 'C', "Cytosine" },
51  { 'G', "Guanine" },
52  { 'T', "Thymine" },
53  { 'U', "Uracil" },
54 
55  { 'W', "Weak" },
56  { 'S', "Strong" },
57  { 'M', "aMino" },
58  { 'K', "Keto" },
59  { 'R', "puRine" },
60  { 'Y', "pYrimidine" },
61 
62  { 'B', "not A" },
63  { 'D', "not C" },
64  { 'H', "not G" },
65  { 'V', "not T" },
66 
67  { 'N', "any" },
68  { 'O', "omitted" },
69  { 'X', "masked" },
70  { '.', "gap" },
71  { '-', "gap" },
72  { '?', "gap" }
73 };
74 
// Lookup table from an amino acid code char to a descriptive name.
// Keys are upper case letters plus the symbols '*', '-' and '?'; amino_acid_name() converts
// the queried char to upper case before consulting this table.
// 'B', 'J' and 'Z' name a pair of possible amino acids each; 'X' stands for "any".
75 static const std::unordered_map<char, std::string> amino_acid_code_to_name = {
76  { 'A', "Alanine" },
77  { 'B', "Aspartic acid or Asparagine" },
78  { 'C', "Cysteine" },
79  { 'D', "Aspartic acid" },
80  { 'E', "Glutamic acid" },
81  { 'F', "Phenylalanine" },
82  { 'G', "Glycine" },
83  { 'H', "Histidine" },
84  { 'I', "Isoleucine" },
85  { 'J', "Leucine or Isoleucine" },
86  { 'K', "Lysine" },
87  { 'L', "Leucine" },
88  { 'M', "Methionine" },
89  { 'N', "Asparagine" },
90  { 'O', "Pyrrolysine" },
91  { 'P', "Proline" },
92  { 'Q', "Glutamine" },
93  { 'R', "Arginine" },
94  { 'S', "Serine" },
95  { 'T', "Threonine" },
96  { 'U', "Selenocysteine" },
97  { 'V', "Valine" },
98  { 'W', "Tryptophan" },
99  { 'Y', "Tyrosine" },
100  { 'Z', "Glutamic acid or Glutamine" },
101 
102  { 'X', "any" },
103  { '*', "translation stop" },
104  { '-', "gap" },
105  { '?', "gap" }
106 };
107 
108 // =================================================================================================
109 // Color Lists
110 // =================================================================================================
111 
// Table of text color names (as strings) per nucleic acid code char.
// Only 'A', 'C', 'G' and 'T'/'U' get a distinct color ('T' and 'U' share "Blue");
// every other code is "DarkGray".
// NOTE(review): presumably the table handed out by nucleic_acid_text_colors(); the body of
// that function is not visible in this listing, so confirm against the original file.
112 static const std::map<char, std::string> nucleic_acid_text_colors_map = {
113  { 'A', "Red" },
114  { 'C', "Green" },
115  { 'G', "Yellow" },
116  { 'T', "Blue" },
117  { 'U', "Blue" },
118 
119  { 'W', "DarkGray" },
120  { 'S', "DarkGray" },
121  { 'M', "DarkGray" },
122  { 'K', "DarkGray" },
123  { 'R', "DarkGray" },
124  { 'Y', "DarkGray" },
125 
126  { 'B', "DarkGray" },
127  { 'D', "DarkGray" },
128  { 'H', "DarkGray" },
129  { 'V', "DarkGray" },
130 
131  { 'N', "DarkGray" },
132  { 'O', "DarkGray" },
133  { 'X', "DarkGray" },
134  { '.', "DarkGray" },
135  { '-', "DarkGray" },
136  { '?', "DarkGray" }
137 };
138 
// Table of text color names (as strings) per amino acid code char.
// The codes 'B', 'J', 'O', 'U', 'Z' and the trailing group 'X', '*', '-', '?' are all
// "DarkGray"; the remaining codes share a small set of color names.
// NOTE(review): presumably the table handed out by amino_acid_text_colors(); the body of
// that function is not visible in this listing, so confirm against the original file.
139 static const std::map<char, std::string> amino_acid_text_colors_map = {
140  { 'A', "Blue" },
141  { 'B', "DarkGray" },
142  { 'C', "LightMagenta" },
143  { 'D', "Magenta" },
144  { 'E', "Magenta" },
145  { 'F', "Blue" },
146  { 'G', "LightRed" },
147  { 'H', "Cyan" },
148  { 'I', "Blue" },
149  { 'J', "DarkGray" },
150  { 'K', "Red" },
151  { 'L', "Blue" },
152  { 'M', "Blue" },
153  { 'N', "Green" },
154  { 'O', "DarkGray" },
155  { 'P', "Yellow" },
156  { 'Q', "Green" },
157  { 'R', "Red" },
158  { 'S', "Green" },
159  { 'T', "Green" },
160  { 'U', "DarkGray" },
161  { 'V', "Blue" },
162  { 'W', "Blue" },
163  { 'Y', "Cyan" },
164  { 'Z', "DarkGray" },
165 
166  { 'X', "DarkGray" },
167  { '*', "DarkGray" },
168  { '-', "DarkGray" },
169  { '?', "DarkGray" }
170 };
171 
// Table of utils::Color values per nucleic acid code char.
// Each Color is built from three numeric components.
// NOTE(review): presumably red/green/blue in the range [0.0, 1.0] - confirm against utils::Color.
// 'T' and 'U' share a value; the two-letter ambiguity codes (W, S, M, K, R, Y) use 0.376 in all
// components, and all remaining codes use 0.5 in all components.
// NOTE(review): presumably the table handed out by nucleic_acid_colors(); the body of that
// function is not visible in this listing, so confirm against the original file.
172 static const std::map<char, utils::Color> nucleic_acid_colors_map = {
173  { 'A', { 1.0, 0.0, 0.0 } },
174  { 'C', { 0.0, 1.0, 0.0 } },
175  { 'G', { 1.0, 1.0, 0.0 } },
176  { 'T', { 0.0, 0.0, 1.0 } },
177  { 'U', { 0.0, 0.0, 1.0 } },
178 
179  { 'W', { 0.376, 0.376, 0.376 } },
180  { 'S', { 0.376, 0.376, 0.376 } },
181  { 'M', { 0.376, 0.376, 0.376 } },
182  { 'K', { 0.376, 0.376, 0.376 } },
183  { 'R', { 0.376, 0.376, 0.376 } },
184  { 'Y', { 0.376, 0.376, 0.376 } },
185 
186  { 'B', { 0.5, 0.5, 0.5 } },
187  { 'D', { 0.5, 0.5, 0.5 } },
188  { 'H', { 0.5, 0.5, 0.5 } },
189  { 'V', { 0.5, 0.5, 0.5 } },
190 
191  { 'N', { 0.5, 0.5, 0.5 } },
192  { 'O', { 0.5, 0.5, 0.5 } },
193  { 'X', { 0.5, 0.5, 0.5 } },
194  { '.', { 0.5, 0.5, 0.5 } },
195  { '-', { 0.5, 0.5, 0.5 } },
196  { '?', { 0.5, 0.5, 0.5 } }
197 };
198 
// Table of utils::Color values per amino acid code char; amino_acid_colors() returns a copy of it.
// Each Color is built from three numeric components.
// NOTE(review): presumably red/green/blue in the range [0.0, 1.0] - confirm against utils::Color.
// The codes 'B', 'J', 'O', 'U' and 'Z' use 0.376 in all components; the trailing group
// 'X', '*', '-', '?' uses 0.5 in all components.
199 static const std::map<char, utils::Color> amino_acid_colors_map = {
200  { 'A', { 0.098, 0.500, 1.000 } },
201  { 'B', { 0.376, 0.376, 0.376 } },
202  { 'C', { 0.902, 0.500, 0.500 } },
203  { 'D', { 0.800, 0.302, 0.800 } },
204  { 'E', { 0.800, 0.302, 0.800 } },
205  { 'F', { 0.098, 0.500, 1.000 } },
206  { 'G', { 0.902, 0.600, 0.302 } },
207  { 'H', { 0.098, 0.702, 0.702 } },
208  { 'I', { 0.098, 0.500, 1.000 } },
209  { 'J', { 0.376, 0.376, 0.376 } },
210  { 'K', { 0.902, 0.200, 0.098 } },
211  { 'L', { 0.098, 0.500, 1.000 } },
212  { 'M', { 0.098, 0.500, 1.000 } },
213  { 'N', { 0.098, 0.800, 0.098 } },
214  { 'O', { 0.376, 0.376, 0.376 } },
215  { 'P', { 0.800, 0.800, 0.000 } },
216  { 'Q', { 0.098, 0.800, 0.098 } },
217  { 'R', { 0.902, 0.200, 0.098 } },
218  { 'S', { 0.098, 0.800, 0.098 } },
219  { 'T', { 0.098, 0.800, 0.098 } },
220  { 'U', { 0.376, 0.376, 0.376 } },
221  { 'V', { 0.098, 0.500, 1.000 } },
222  { 'W', { 0.098, 0.500, 1.000 } },
223  { 'Y', { 0.098, 0.702, 0.702 } },
224  { 'Z', { 0.376, 0.376, 0.376 } },
225 
226  { 'X', { 0.5, 0.5, 0.5 } },
227  { '*', { 0.5, 0.5, 0.5 } },
228  { '-', { 0.5, 0.5, 0.5 } },
229  { '?', { 0.5, 0.5, 0.5 } }
230 };
231 
232 // =================================================================================================
233 // Ambiguity Lists
234 // =================================================================================================
235 
// Table from a nucleic acid code char to the string of codes that it can stand for.
// The values only use the letters A, C, G, T and the gap char '-'.
// 'U' is mapped to "T"; 'N' expands to all four letters; 'O', 'X', '.', '-' and '?' map to "-".
// Read by nucleic_acid_ambiguities() after converting the queried char to upper case.
236 static const std::unordered_map<char, std::string> nucleic_acid_ambiguity_char_map = {
237  { 'A', "A" },
238  { 'C', "C" },
239  { 'G', "G" },
240  { 'T', "T" },
241  { 'U', "T" },
242 
243  { 'W', "AT" },
244  { 'S', "CG" },
245  { 'M', "AC" },
246  { 'K', "GT" },
247  { 'R', "AG" },
248  { 'Y', "CT" },
249 
250  { 'B', "CGT" },
251  { 'D', "AGT" },
252  { 'H', "ACT" },
253  { 'V', "ACG" },
254 
255  { 'N', "ACGT" },
256  { 'O', "-" },
257  { 'X', "-" },
258  { '.', "-" },
259  { '-', "-" },
260  { '?', "-" }
261 };
262 
// Reverse table: from a set of nucleic acid codes, written as a string, to the single code char
// that represents the whole set.
// Every key is upper case, sorted and free of duplicates; nucleic_acid_ambiguity_code() brings
// its input into exactly that form before looking it up here.
// There is no key containing 'U', and the only key containing the gap char is "-" itself.
263 static const std::unordered_map< std::string, char > nucleic_acid_ambiguity_string_map = {
264  { "A", 'A' },
265  { "C", 'C' },
266  { "G", 'G' },
267  { "T", 'T' },
268 
269  { "AT", 'W' },
270  { "CG", 'S' },
271  { "AC", 'M' },
272  { "GT", 'K' },
273  { "AG", 'R' },
274  { "CT", 'Y' },
275 
276  { "CGT", 'B' },
277  { "AGT", 'D' },
278  { "ACT", 'H' },
279  { "ACG", 'V' },
280 
281  { "ACGT", 'N' },
282  { "-", '-' }
283 };
284 
285 // =================================================================================================
286 // Codes
287 // =================================================================================================
288 
289 // ---------------------------------------------------------------------
290 // Nucleic Acids
291 // ---------------------------------------------------------------------
292 
294 {
295  return "ACGT";
296 }
297 
299 {
300  return "WSMKRYBDHV";
301 }
302 
304 {
305  return "NOX.-?";
306 }
307 
309 {
310  return nucleic_acid_codes_plain()
313 }
314 
315 // ---------------------------------------------------------------------
316 // Amino Acids
317 // ---------------------------------------------------------------------
318 
320 {
321  return "ACDEFGHIKLMNOPQRSTUVWY";
322 }
323 
325 {
326  return "BJZ";
327 }
328 
330 {
331  return "X*-?";
332 }
333 
334 std::string amino_acid_codes_all()
335 {
336  return amino_acid_codes_plain()
339 }
340 
341 // ---------------------------------------------------------------------
342 // Misc
343 // ---------------------------------------------------------------------
344 
345 std::string normalize_codes( std::string const& alphabet )
346 {
347  // Uppercase, sort, uniq the alphabet.
348  auto normalized = utils::to_upper_ascii( alphabet );
349  std::sort( normalized.begin(), normalized.end() );
350  normalized.erase( std::unique( normalized.begin(), normalized.end() ), normalized.end() );
351  return normalized;
352 }
353 
354 // =================================================================================================
355 // Color Codes
356 // =================================================================================================
357 
358 std::map<char, std::string> nucleic_acid_text_colors()
359 {
361 }
362 
363 std::map<char, std::string> amino_acid_text_colors()
364 {
366 }
367 
368 std::map<char, utils::Color> nucleic_acid_colors()
369 {
371 }
372 
// Return a map of Colors for each amino acid code.
// The result is a copy of the file-local table amino_acid_colors_map, as the map is returned by value.
373 std::map<char, utils::Color> amino_acid_colors()
374 {
375  return amino_acid_colors_map;
376 }
377 
378 // =================================================================================================
379 // Translate Codes
380 // =================================================================================================
381 
382 std::string nucleic_acid_name( char code )
383 {
384  auto ucode = toupper(code);
385  if( nucleic_acid_code_to_name.count( ucode ) == 0 ) {
386  throw std::out_of_range( "Invalid nucleic acid code '" + std::string( 1, code ) + "'." );
387  }
388  return nucleic_acid_code_to_name.at( ucode );
389 }
390 
391 std::string amino_acid_name( char code )
392 {
393  auto ucode = toupper(code);
394  if( amino_acid_code_to_name.count( ucode ) == 0 ) {
395  throw std::out_of_range( "Invalid amino acid code '" + std::string( 1, code ) + "'." );
396  }
397  return amino_acid_code_to_name.at( ucode );
398 }
399 
400 std::string nucleic_acid_ambiguities( char code )
401 {
402  auto ucode = toupper(code);
403  if( nucleic_acid_code_to_name.count( ucode ) == 0 ) {
404  throw std::out_of_range( "Invalid nucleic acid code '" + std::string( 1, code ) + "'." );
405  }
406  return nucleic_acid_ambiguity_char_map.at( ucode );
407 }
408 
409 char nucleic_acid_ambiguity_code( std::string codes )
410 {
411  // Uppercase, sort, uniq the codes.
412  auto tmp = utils::to_upper_ascii( codes );
413  std::sort( tmp.begin(), tmp.end() );
414  tmp.erase( std::unique( tmp.begin(), tmp.end() ), tmp.end() );
415 
416  if( nucleic_acid_ambiguity_string_map.count( tmp ) == 0 ) {
417  throw std::out_of_range( "Invalid nucleic acid codes '" + codes + "'." );
418  }
419  return nucleic_acid_ambiguity_string_map.at( tmp );
420 }
421 
422 } // namespace sequence
423 } // namespace genesis
std::string amino_acid_codes_all()
Return all valid amino acid codes. Those are "ACDEFGHIKLMNOPQRSTUVWYBJZX*-?".
Definition: codes.cpp:334
static const std::map< char, std::string > nucleic_acid_text_colors_map
Definition: codes.cpp:112
std::string amino_acid_codes_degenerated()
Return all degenerated amino acid codes. Those are "BJZ".
Definition: codes.cpp:324
std::string amino_acid_codes_plain()
Return all plain amino acid codes. Those are "ACDEFGHIKLMNOPQRSTUVWY".
Definition: codes.cpp:319
std::string nucleic_acid_codes_all()
Return all valid nucleic acid codes. Those are "ACGTWSMKRYBDHVNOX.-?".
Definition: codes.cpp:308
std::string nucleic_acid_name(char code)
Get the name of a nucleic acid given its IUPAC code.
Definition: codes.cpp:382
static const std::unordered_map< char, std::string > nucleic_acid_code_to_name
Definition: codes.cpp:48
std::string nucleic_acid_codes_plain()
Return all plain nucleic acid codes. Those are "ACGT".
Definition: codes.cpp:293
Header of Color class.
std::string normalize_codes(std::string const &alphabet)
Normalize a set of Sequence codes, i.e., make them upper case, sort them, and remove duplicates...
Definition: codes.cpp:345
std::string nucleic_acid_codes_degenerated()
Return all degenerated nucleic acid codes. Those are "WSMKRYBDHV".
Definition: codes.cpp:298
std::string nucleic_acid_codes_undetermined()
Return all undetermined nucleic acid codes. Those are "NOX.-?".
Definition: codes.cpp:303
std::map< char, std::string > nucleic_acid_text_colors()
Return a map of text colors for each nucleic acid code.
Definition: codes.cpp:358
std::string amino_acid_name(char code)
Get the name of an amino acid given its IUPAC code.
Definition: codes.cpp:391
static const std::unordered_map< char, std::string > amino_acid_code_to_name
Definition: codes.cpp:75
static const std::map< char, utils::Color > nucleic_acid_colors_map
Definition: codes.cpp:172
Provides some commonly used string utility functions.
static const std::unordered_map< char, std::string > nucleic_acid_ambiguity_char_map
Definition: codes.cpp:236
std::map< char, utils::Color > amino_acid_colors()
Return a map of Colors for each amino acid code.
Definition: codes.cpp:373
static const std::map< char, std::string > amino_acid_text_colors_map
Definition: codes.cpp:139
std::map< char, std::string > amino_acid_text_colors()
Return a map of text colors for each amino acid code.
Definition: codes.cpp:363
static const std::map< char, utils::Color > amino_acid_colors_map
Definition: codes.cpp:199
std::string nucleic_acid_ambiguities(char code)
Return the possible ambiguous nucleic acid codes for a given code char.
Definition: codes.cpp:400
char to_upper_ascii(char c)
Return the upper case of a given char, ascii-only.
Definition: string.cpp:286
std::string amino_acid_codes_undetermined()
Return all undetermined amino acid codes. Those are "X*-?".
Definition: codes.cpp:329
static const std::unordered_map< std::string, char > nucleic_acid_ambiguity_string_map
Definition: codes.cpp:263
std::map< char, utils::Color > nucleic_acid_colors()
Return a map of Colors for each nucleic acid code.
Definition: codes.cpp:368
char nucleic_acid_ambiguity_code(std::string codes)
Return the nucleic acid code that represents all given codes.
Definition: codes.cpp:409