A library for working with phylogenetic and population genetic data.
v0.27.0
quality.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
39 
40 #include <algorithm>
41 #include <array>
42 #include <cassert>
43 #include <cmath>
44 #include <cstdint>
45 #include <stdexcept>
46 
47 #ifdef GENESIS_AVX
48  #include <immintrin.h>
49 #endif
50 
51 namespace genesis {
52 namespace sequence {
53 
54 // =================================================================================================
55 // Phred Score To Error Probability
56 // =================================================================================================
57 
58 // Local array for faster lookup of phred score error probabilities.
59 // Code to generate the array:
60 //
61 // #include <iostream>
62 // std::cout.precision( 15 );
63 // size_t c = 0;
64 // for( size_t i = 0; i < 256; ++i ) {
65 // if( ++c == 4 || i % 10 == 0 ) {
66 // c = 0;
67 // std::cout << "\n ";
68 // }
69 // if( i % 10 == 0 ) {
70 // std::cout << "/* " << i << " */ ";
71 // }
72 // std::cout << std::pow( 10.0, static_cast<double>( i ) / -10.0 ) << ", ";
73 // }
74 // std::cout << "\n";
75 //
76 // See phred_score_to_error_probability() for usage. We probably never ever need the values above 93,
77 // as those cannot even be encoded in normal ASCII-based phred score codes, and also probably
78 // don't even need values above 60, which is usually taken as an upper bound. But let's be through
79 // here, and include the whole range of possible values for our data type.
80 static const std::array<double, 256> phred_score_to_error_probability_lookup_ = {{
81  /* 0 */ 1.0, 0.794328234724281, 0.630957344480193, 0.501187233627272,
82  0.398107170553497, 0.316227766016838, 0.251188643150958, 0.199526231496888,
83  0.158489319246111, 0.125892541179417,
84  /* 10 */ 0.1, 0.0794328234724281, 0.0630957344480193, 0.0501187233627272,
85  0.0398107170553497, 0.0316227766016838, 0.0251188643150958, 0.0199526231496888,
86  0.0158489319246111, 0.0125892541179417,
87  /* 20 */ 0.01, 0.00794328234724281, 0.00630957344480193, 0.00501187233627272,
88  0.00398107170553497, 0.00316227766016838, 0.00251188643150958, 0.00199526231496888,
89  0.00158489319246111, 0.00125892541179417,
90  /* 30 */ 0.001, 0.000794328234724281, 0.000630957344480193, 0.000501187233627273,
91  0.000398107170553497, 0.000316227766016838, 0.000251188643150958, 0.000199526231496888,
92  0.000158489319246111, 0.000125892541179417,
93  /* 40 */ 0.0001, 7.94328234724282e-05, 6.30957344480193e-05, 5.01187233627273e-05,
94  3.98107170553497e-05, 3.16227766016838e-05, 2.51188643150958e-05, 1.99526231496888e-05,
95  1.58489319246111e-05, 1.25892541179417e-05,
96  /* 50 */ 1e-05, 7.94328234724282e-06, 6.30957344480193e-06, 5.01187233627272e-06,
97  3.98107170553497e-06, 3.16227766016838e-06, 2.51188643150958e-06, 1.99526231496888e-06,
98  1.58489319246111e-06, 1.25892541179417e-06,
99  /* 60 */ 1e-06, 7.94328234724282e-07, 6.30957344480193e-07, 5.01187233627272e-07,
100  3.98107170553497e-07, 3.16227766016838e-07, 2.51188643150958e-07, 1.99526231496888e-07,
101  1.58489319246111e-07, 1.25892541179417e-07,
102  /* 70 */ 1e-07, 7.94328234724282e-08, 6.30957344480193e-08, 5.01187233627272e-08,
103  3.98107170553497e-08, 3.16227766016838e-08, 2.51188643150958e-08, 1.99526231496888e-08,
104  1.58489319246111e-08, 1.25892541179417e-08,
105  /* 80 */ 1e-08, 7.94328234724282e-09, 6.30957344480194e-09, 5.01187233627271e-09,
106  3.98107170553497e-09, 3.16227766016838e-09, 2.51188643150958e-09, 1.99526231496888e-09,
107  1.58489319246111e-09, 1.25892541179417e-09,
108  /* 90 */ 1e-09, 7.94328234724282e-10, 6.30957344480194e-10, 5.01187233627271e-10,
109  3.98107170553497e-10, 3.16227766016838e-10, 2.51188643150958e-10, 1.99526231496888e-10,
110  1.58489319246111e-10, 1.25892541179417e-10,
111  /* 100 */ 1e-10, 7.94328234724282e-11, 6.30957344480194e-11, 5.01187233627271e-11,
112  3.98107170553497e-11, 3.16227766016838e-11, 2.51188643150958e-11, 1.99526231496888e-11,
113  1.58489319246111e-11, 1.25892541179417e-11,
114  /* 110 */ 1e-11, 7.94328234724282e-12, 6.30957344480194e-12, 5.01187233627271e-12,
115  3.98107170553497e-12, 3.16227766016838e-12, 2.51188643150958e-12, 1.99526231496888e-12,
116  1.58489319246111e-12, 1.25892541179417e-12,
117  /* 120 */ 1e-12, 7.94328234724282e-13, 6.30957344480194e-13, 5.01187233627271e-13,
118  3.98107170553497e-13, 3.16227766016838e-13, 2.51188643150958e-13, 1.99526231496888e-13,
119  1.58489319246111e-13, 1.25892541179417e-13,
120  /* 130 */ 1e-13, 7.94328234724282e-14, 6.30957344480194e-14, 5.01187233627271e-14,
121  3.98107170553497e-14, 3.16227766016838e-14, 2.51188643150958e-14, 1.99526231496888e-14,
122  1.58489319246111e-14, 1.25892541179417e-14,
123  /* 140 */ 1e-14, 7.94328234724282e-15, 6.30957344480194e-15, 5.01187233627271e-15,
124  3.98107170553497e-15, 3.16227766016838e-15, 2.51188643150958e-15, 1.99526231496888e-15,
125  1.58489319246111e-15, 1.25892541179417e-15,
126  /* 150 */ 1e-15, 7.94328234724282e-16, 6.30957344480194e-16, 5.01187233627271e-16,
127  3.98107170553497e-16, 3.16227766016838e-16, 2.51188643150958e-16, 1.99526231496888e-16,
128  1.58489319246111e-16, 1.25892541179417e-16,
129  /* 160 */ 1e-16, 7.94328234724279e-17, 6.30957344480194e-17, 5.01187233627271e-17,
130  3.98107170553499e-17, 3.16227766016838e-17, 2.51188643150957e-17, 1.99526231496888e-17,
131  1.58489319246111e-17, 1.25892541179417e-17,
132  /* 170 */ 1e-17, 7.94328234724279e-18, 6.30957344480194e-18, 5.01187233627271e-18,
133  3.98107170553499e-18, 3.16227766016838e-18, 2.51188643150957e-18, 1.99526231496888e-18,
134  1.58489319246111e-18, 1.25892541179417e-18,
135  /* 180 */ 1e-18, 7.94328234724279e-19, 6.30957344480194e-19, 5.01187233627271e-19,
136  3.98107170553499e-19, 3.16227766016838e-19, 2.51188643150957e-19, 1.99526231496888e-19,
137  1.58489319246111e-19, 1.25892541179417e-19,
138  /* 190 */ 1e-19, 7.94328234724279e-20, 6.30957344480194e-20, 5.01187233627271e-20,
139  3.98107170553499e-20, 3.16227766016838e-20, 2.51188643150957e-20, 1.99526231496888e-20,
140  1.58489319246111e-20, 1.25892541179417e-20,
141  /* 200 */ 1e-20, 7.94328234724279e-21, 6.30957344480194e-21, 5.01187233627271e-21,
142  3.98107170553499e-21, 3.16227766016838e-21, 2.51188643150957e-21, 1.99526231496888e-21,
143  1.58489319246111e-21, 1.25892541179417e-21,
144  /* 210 */ 1e-21, 7.94328234724279e-22, 6.30957344480194e-22, 5.01187233627272e-22,
145  3.98107170553499e-22, 3.16227766016838e-22, 2.51188643150957e-22, 1.99526231496888e-22,
146  1.58489319246111e-22, 1.25892541179417e-22,
147  /* 220 */ 1e-22, 7.94328234724279e-23, 6.30957344480194e-23, 5.01187233627271e-23,
148  3.98107170553499e-23, 3.16227766016838e-23, 2.51188643150957e-23, 1.99526231496888e-23,
149  1.58489319246111e-23, 1.25892541179417e-23,
150  /* 230 */ 1e-23, 7.94328234724279e-24, 6.30957344480194e-24, 5.01187233627271e-24,
151  3.98107170553499e-24, 3.16227766016838e-24, 2.51188643150957e-24, 1.99526231496888e-24,
152  1.58489319246111e-24, 1.25892541179417e-24,
153  /* 240 */ 1e-24, 7.94328234724279e-25, 6.30957344480194e-25, 5.01187233627272e-25,
154  3.98107170553499e-25, 3.16227766016838e-25, 2.51188643150957e-25, 1.99526231496888e-25,
155  1.58489319246111e-25, 1.25892541179417e-25,
156  /* 250 */ 1e-25, 7.94328234724279e-26, 6.30957344480194e-26, 5.01187233627271e-26,
157  3.98107170553499e-26, 3.16227766016838e-26
158 }};
159 
160 // =================================================================================================
161 // Quality Encoding and Decoding
162 // =================================================================================================
163 
167 inline void throw_invalid_quality_code_( char quality_code, QualityEncoding encoding )
168 {
169  throw std::invalid_argument(
170  "Invalid quality code: " + utils::char_to_hex( quality_code ) +
171  " is not in the valid range for " + quality_encoding_name( encoding ) + " quality codes."
172  );
173 }
174 
175 std::string quality_encoding_name( QualityEncoding encoding )
176 {
177  switch( encoding ) {
179  return "Sanger";
181  return "Illumina 1.3+";
183  return "Illumina 1.5+";
185  return "Illumina 1.8+";
187  return "Solexa";
188 
189  default:
190  throw std::invalid_argument( "Invalid quality encoding type." );
191  };
192 }
193 
195 {
196  // Transform the string into a uniform form.
197  auto s = utils::to_lower( name );
198  s.erase( std::remove_if(s.begin(), s.end(), []( char c ){
199  return ! utils::is_alnum( c );
200  }), s.end() );
201 
202  // Match the enum.
203  if( s == "sanger" ) {
205  } else if( s == "illumina13" ) {
207  } else if( s == "illumina15" ) {
209  } else if( s == "illumina18" || s == "illumina" ) {
211  } else if( s == "solexa" ) {
213  } else {
214  throw std::runtime_error( "Invalid quality encoding name: \"" + name + "\"" );
215  }
216 }
217 
218 unsigned char quality_decode_to_phred_score( char quality_code, QualityEncoding encoding )
219 {
220  // Convert using an offset. It's as simple as that.
221  // Except that we have different offsets for different fastq encoding styles.
222  // And also, Solexa needs special treatment, as we internally use phred scores only.
223  // Basically, fastq is again one of those weird bioinformatics file formats that drives
224  // everyone crazy by being ill-defined, and having contraticting variants and conventions...
225  // NB: We do not check for upper bounds of the scores here, as higher-quality scores can come
226  // from downstream processing.
227  switch( encoding ) {
230  if( quality_code < 33 || quality_code >= 127 ) {
231  throw_invalid_quality_code_( quality_code, encoding );
232  }
233  return static_cast<unsigned char>( quality_code ) - 33;
234 
237  if( quality_code < 64 || quality_code >= 127 ) {
238  throw_invalid_quality_code_( quality_code, encoding );
239  }
240  return static_cast<unsigned char>( quality_code ) - 64;
241 
243  if( quality_code < 59 || quality_code >= 127 ) {
244  throw_invalid_quality_code_( quality_code, encoding );
245  }
246  return solexa_score_to_phred_score( static_cast<unsigned char>( quality_code ) - 64 );
247 
248  default:
249  throw std::invalid_argument( "Invalid quality encoding type." );
250  };
251 }
252 
253 std::vector<unsigned char> quality_decode_to_phred_score(
254  std::string const& quality_codes,
255  QualityEncoding encoding
256 ) {
257  // Reserve space for the result. Do this first, to facilitate copy elision.
258  auto result = std::vector<unsigned char>( quality_codes.size() );
259 
260  // Only switch on the encoding once, for speed. We use a fake offset for Solexa,
261  // as they can go in the negative range to -5. Doing it this way makes our error checking
262  // code consistent. We correct for this later in the Solxa conversion loop below.
263  unsigned char offset;
264  switch( encoding ) {
267  offset = 33;
268  break;
269 
272  offset = 64;
273  break;
274 
276  offset = 59;
277  break;
278 
279  default:
280  throw std::invalid_argument( "Invalid quality encoding type." );
281  };
282 
283  // We use this as an indicator whether we found an error in the sequence.
284  // This allows to run the inner loops without branching.
285  bool good = true;
286 
287  // We use a loop variable here that is shared between the AVX and normal version,
288  // so that we do not have to duplicate the loop.
289  size_t i = 0;
290 
291  // In the loops below, we convert Solexa as if it was phred, and fix this later.
292  // This avoids code duplication for the error checking.
293 
294  #ifdef GENESIS_AVX
295 
296  // Get the data in the right format
297  auto const* data = reinterpret_cast<__m256i const*>( quality_codes.c_str() );
298  auto* write = reinterpret_cast<__m256i*>( result.data() );
299 
300  // Define some masks. We use 32 byte full of the offset, and full of the upper limit for
301  // ascii-encoded phred scores.
302  auto const m_offset = _mm256_set1_epi8( offset );
303  auto static const m_upper = _mm256_set1_epi8( 126 );
304 
305  // Process in 32-byte chunks. The loop condition uses integer division / 32, so we miss the rest
306  // that is above a multiple of 32. We process this later.
307  for( size_t j = 0; j < quality_codes.size() / 32; ++j ) {
308 
309  // Load the data and compare it to the offset and upper limit.
310  auto const m_data = _mm256_loadu_si256( data + j );
311  auto const m_min = _mm256_cmpgt_epi8( m_offset, m_data );
312  auto const m_max = _mm256_cmpgt_epi8( m_data, m_upper );
313 
314  // If any char is below the offset or above the upper limit, we have an error.
315  // We just store the result of the test, to avoid branching.
316  good &= _mm256_testz_si256( m_min, m_min ) && _mm256_testz_si256( m_max, m_max );
317 
318  // Subtract the offset, and store the result.
319  auto const m_result = _mm256_sub_epi8( m_data, m_offset );
320  _mm256_storeu_si256( write + j, m_result );
321  }
322 
323  // Set i to what it needs to be so that the loop below processes the remaining chars that are
324  // left in the string after AVX is done with the 32 byte chunks.
325  i = quality_codes.size() / 32 * 32;
326 
327  #endif // GENESIS_AVX
328 
329  // Run the conversion. Again, we avoid branching in the inner loop.
330  // i is already initialized, either from before the AVX code, or from within the AVX code.
331  // This avoids duplicating the below loop for the rest of the chars that are in the remainder
332  // of the string after AVX has processed all 32 byte chunks.
333  for( ; i < quality_codes.size(); ++i ) {
334  good &= ( quality_codes[i] >= offset ) && ( quality_codes[i] <= 126 );
335  result[i] = static_cast<unsigned char>( quality_codes[i] ) - offset;
336  }
337 
338  // If we found an error in the sequence, throw an exception. We run the whole sequence again,
339  // to get the exact char that is bad. This is only in the error case, so we do not care about speed.
340  if( !good ) {
341  for( size_t i = 0; i < quality_codes.size(); ++i ) {
342  if( quality_codes[i] < offset || quality_codes[i] >= 127 ) {
343  throw_invalid_quality_code_( quality_codes[i], encoding );
344  }
345  }
346 
347  // We cannot get here, otherwise, the !good condition would not have led us here.
348  assert( false );
349  }
350 
351  // Use 64bit words instead of vectorization. Maybe later, as an alternative to AVX.
352  // Macros from https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
353  //
354  // #define hasless(x,n) (((x)-~static_cast<uint64_t>(0)/255U*(n))&~(x)&~static_cast<uint64_t>(0)/255U*128)
355  // #define hasmore(x,n) (((x)+~static_cast<uint64_t>(0)/255U*(127-(n))|(x))&~static_cast<uint64_t>(0)/255U*128)
356  //
357  // auto const subtrahend = ~static_cast<uint64_t>(0) / 255U * offset;
358  //
359  // auto const* data = reinterpret_cast<uint64_t const*>( quality_codes.c_str() );
360  // auto* write = reinterpret_cast<uint64_t*>( result.data() );
361  // bool failed = false;
362  //
363  // for( size_t i = 0; i < quality_codes.size() / 8; ++i ) {
364  // bool const l = hasless( data[i], offset );
365  // bool const m = hasmore( data[i], 126 );
366  // failed |= l | m;
367  //
368  // write[i] = data[i] - subtrahend;
369  // }
370  //
371  // #undef hasless
372  // #undef hasmore
373  //
374  // if( failed ) {
375  // throw std::invalid_argument(
376  // "Invalid quality code that is not in the valid range for " +
377  // quality_encoding_name( encoding ) + " quality codes found in sequence."
378  // );
379  // }
380 
381  // For Solexa, we iterate the sequence again in order to convert it to phred.
382  // This is slower and could be avoided with a bit of code duplication, but no one uses that
383  // format anyway any more, so that case should be rare.
384  if( encoding == QualityEncoding::kSolexa ) {
385  for( size_t i = 0; i < quality_codes.size(); ++i ) {
386  result[i] = solexa_score_to_phred_score( result[i] - 5 );
387  }
388  }
389 
390  // Finally, we are done.
391  return result;
392 }
393 
394 // =================================================================================================
395 // Guess Quality Encoding Type
396 // =================================================================================================
397 
398 QualityEncoding guess_quality_encoding( std::array<size_t, 128> const& char_counts )
399 {
400  // Find the first entry that is not 0
401  size_t min = 0;
402  for( size_t i = 0; i < char_counts.size(); ++i ) {
403  if( char_counts[i] > 0 ) {
404  min = i;
405  break;
406  }
407  }
408 
409  // Find the last entry that is not 0
410  size_t max = 128;
411  for( size_t i = char_counts.size(); i > 0; --i ) {
412  if( char_counts[i-1] > 0 ) {
413  max = i-1;
414  break;
415  }
416  }
417 
418  // Some checks.
419  if( min < 33 || max >= 127 ) {
420  throw std::runtime_error(
421  "Invalid char counts provided to guess quality score encoding. Only printable "
422  "characters (ASCII range 33 to 127) are allowed in fastq quality encodings."
423  );
424  }
425  assert( min <= max );
426 
427  // Sanger and Illumina 1.8 use an offset of 33. The next higher offset is 64, but with Solexa ranging into the negative until -5, we find that anything below 64-5=59 cannot have the 64 offset, and hence must have the 33 offset.
428  if( min < 59 ) {
429  // Sanger and Illumina 1.8 are basically the same, so it does not make a difference
430  // whether we detect them correctly or not. However, we can still try to guess, for
431  // completeness. Illumina 1.8 seems to have one more character that can be used. Thus, if
432  // this character occurs, we can be sure. If not, it might just be that no base was that
433  // accurate. But then, it doesn't really matter anyway.
434  if( max > 73 ) {
436  } else {
438  }
439  }
440 
441  // Solexa goes down to a score of -5, with an offset of 64 for 0, so anything below 64 is
442  // negative, meaning that it cannot be Illumina 1.3 or 1.5.
443  if( min < 64 ) {
445  }
446 
447  // At this point, we could use a heuristic to test how common 'B' is, which is special in Illumina 1.5,
448  // see https://github.com/brentp/bio-playground/blob/master/reads-utils/guess-encoding.py for
449  // details. This would enable more fine-grained distinction between Illumina 1.3 and 1.5.
450  // But for now, we simply assume that an encoding without anything before 'B' is Illumina 1.5
451  if( min < 66 ) {
453  }
455 }
456 
457 QualityEncoding guess_fastq_quality_encoding( std::shared_ptr< utils::BaseInputSource > source )
458 {
459  // Init a counting array for each char, use value-initialization to 0
460  auto char_counts = std::array<size_t, 128>();
461 
462  // Prepare a reader that simply increments all char counts for the quality chars
463  // that are found in the sequences.
464  auto reader = FastqReader();
465  reader.quality_string_plugin(
466  [&]( std::string const& quality_string, Sequence& sequence )
467  {
468  (void) sequence;
469  for( auto q : quality_string ) {
470 
471  // Cast, so that we catch the issue that char can be signed or unsigned,
472  // depending on the compiler. This might cause unnecessary warnings otherwise.
473  auto qu = static_cast<unsigned char>(q);
474  if( qu > 127 ) {
475  throw std::invalid_argument(
476  "Invalid quality score character outside of the ASCII range."
477  );
478  }
479 
480  // assert( (qu & ~0x7f) == 0 );
481  assert( qu < char_counts.size() );
482  ++char_counts[ qu ];
483  }
484  }
485  );
486 
487  // Read the input, sequence by sequence.
488  utils::InputStream input_stream( source );
489  Sequence seq;
490  while( reader.parse_sequence( input_stream, seq ) ) {
491  // Do nothing. All the work is done in the plugin function above.
492  }
493 
494  // Return our guess based on the quality characters that were found in the sequences.
495  return guess_quality_encoding( char_counts );
496 }
497 
498 // =================================================================================================
499 // Quality Computations
500 // =================================================================================================
501 
502 unsigned char error_probability_to_phred_score( double error_probability )
503 {
504  if( ! std::isfinite( error_probability ) || error_probability < 0.0 || error_probability > 1.0 ) {
505  throw std::invalid_argument(
506  "Cannot convert error probability outside of range [0.0, 1.0] to phred score."
507  );
508  }
509 
510  // Compute the value and put into the valid range for unsigned chars. This might exceed
511  // the encoding that is later used to store the scores in fastq, but this does not concern us
512  // here. Instead, we offer the full range here, and clamp later to the value range when encoding.
513  double v = std::round( -10.0 * std::log10( error_probability ));
514  v = std::min( v, 255.0 );
515  return static_cast<unsigned char>( v );
516 }
517 
518 double phred_score_to_error_probability( unsigned char phred_score )
519 {
520  return phred_score_to_error_probability_lookup_[ phred_score ];
521  // return std::pow( 10.0, static_cast<double>( phred_score ) / -10.0 );
522 }
523 
524 signed char error_probability_to_solexa_score( double error_probability )
525 {
526  if( ! std::isfinite( error_probability ) || error_probability < 0.0 || error_probability > 1.0 ) {
527  throw std::invalid_argument(
528  "Cannot convert error probability outside of range [0.0, 1.0] to solexa score."
529  );
530  }
531 
532  // The following are the limits than can be encoded in typical fastq-solexa encoding.
533  // We are not using them here, but instead use more relaxed limts, and will apply the actual
534  // limits only when encoding to fastq.
535  // // min that can be encoded in fastq with solexa encoding
536  // if( error_probability < 6.30957344e-7 ) {
537  // return 62;
538  // }
539  // // max that can be encoded in fastq with solexa encoding
540  // if( error_probability > 0.75 ) {
541  // return -5;
542  // }
543 
544  // Compute the score, and check its boundaries. We did a check before, so this is duplication,
545  // but leads to nicer user output.
546  double v = std::round( -10.0 * std::log10( error_probability / ( 1.0 - error_probability )));
547  v = std::min( v, 127.0 );
548  v = std::max( v, -128.0 );
549  return static_cast<signed char>( v );
550 }
551 
552 double solexa_score_to_error_probability( signed char solexa_score )
553 {
554  if( solexa_score < -5 ) {
555  solexa_score = -5;
556  }
557  double const t = std::pow( 10.0, static_cast<double>( solexa_score ) / -10.0 );
558  return t / ( 1.0 + t );
559 }
560 
561 signed char phred_score_to_solexa_score( unsigned char phred_score )
562 {
563  if( phred_score <= 1 ) {
564  return -5;
565  }
566  double v = std::round(
567  10.0 * std::log10( std::pow( 10.0, static_cast<double>( phred_score ) / 10.0 ) - 1.0 )
568  );
569  v = std::min( v, 127.0 );
570  return static_cast<signed char>( v );
571 }
572 
573 unsigned char solexa_score_to_phred_score( signed char solexa_score )
574 {
575  // This cannot overflow, so no need to check.
576  return static_cast<unsigned char>( std::round(
577  10.0 * std::log10( std::pow( 10.0, static_cast<double>( solexa_score ) / 10.0 ) + 1.0 )
578  ));
579 }
580 
581 std::vector<unsigned char> error_probability_to_phred_score( std::vector<double> error_probability )
582 {
583  auto res = std::vector<unsigned char>( error_probability.size(), 0 );
584  for( size_t i = 0; i < error_probability.size(); ++i ) {
585  res[i] = error_probability_to_phred_score( error_probability[i] );
586  }
587  return res;
588 }
589 
590 std::vector<double> phred_score_to_error_probability( std::vector<unsigned char> phred_score )
591 {
592  auto res = std::vector<double>( phred_score.size(), 0 );
593  for( size_t i = 0; i < phred_score.size(); ++i ) {
594  res[i] = phred_score_to_error_probability( phred_score[i] );
595  }
596  return res;
597 }
598 
599 std::vector<signed char> error_probability_to_solexa_score( std::vector<double> error_probability )
600 {
601  auto res = std::vector<signed char>( error_probability.size(), 0 );
602  for( size_t i = 0; i < error_probability.size(); ++i ) {
603  res[i] = error_probability_to_solexa_score( error_probability[i] );
604  }
605  return res;
606 }
607 
608 std::vector<double> solexa_score_to_error_probability( std::vector<signed char> solexa_score )
609 {
610  auto res = std::vector<double>( solexa_score.size(), 0 );
611  for( size_t i = 0; i < solexa_score.size(); ++i ) {
612  res[i] = solexa_score_to_error_probability( solexa_score[i] );
613  }
614  return res;
615 }
616 
617 std::vector<signed char> phred_score_to_solexa_score( std::vector<unsigned char> phred_score )
618 {
619  auto res = std::vector<signed char>( phred_score.size(), 0 );
620  for( size_t i = 0; i < phred_score.size(); ++i ) {
621  res[i] = phred_score_to_solexa_score( phred_score[i] );
622  }
623  return res;
624 }
625 
626 std::vector<unsigned char> solexa_score_to_phred_score( std::vector<signed char> solexa_score )
627 {
628  auto res = std::vector<unsigned char>( solexa_score.size(), 0 );
629  for( size_t i = 0; i < solexa_score.size(); ++i ) {
630  res[i] = solexa_score_to_phred_score( solexa_score[i] );
631  }
632  return res;
633 }
634 
635 } // namespace sequence
636 } // namespace genesis
genesis::sequence::phred_score_to_error_probability
double phred_score_to_error_probability(unsigned char phred_score)
Definition: quality.cpp:518
genesis::sequence::quality_decode_to_phred_score
unsigned char quality_decode_to_phred_score(char quality_code, QualityEncoding encoding)
Decode a single quality score char (for example coming from a fastq file) to a phred score.
Definition: quality.cpp:218
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:81
genesis::sequence::phred_score_to_error_probability_lookup_
static const std::array< double, 256 > phred_score_to_error_probability_lookup_
Definition: quality.cpp:80
genesis::sequence::throw_invalid_quality_code_
void throw_invalid_quality_code_(char quality_code, QualityEncoding encoding)
Local helper function to throw if any invalid fastq quality chars are being used.
Definition: quality.cpp:167
fastq_reader.hpp
genesis::sequence::guess_quality_encoding_from_name
QualityEncoding guess_quality_encoding_from_name(std::string const &name)
Guess the QualityEncoding type, given its description name.
Definition: quality.cpp:194
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
genesis::sequence::QualityEncoding::kSanger
@ kSanger
genesis::sequence::QualityEncoding::kIllumina15
@ kIllumina15
genesis::sequence::FastqReader
Read Fastq sequence data.
Definition: fastq_reader.hpp:149
genesis::utils::offset
void offset(Histogram &h, double value)
Definition: operations.cpp:47
genesis::sequence::QualityEncoding::kSolexa
@ kSolexa
input_source.hpp
genesis::sequence::QualityEncoding::kIllumina13
@ kIllumina13
string.hpp
Provides some commonly used string utility functions.
input_stream.hpp
genesis::sequence::QualityEncoding::kIllumina18
@ kIllumina18
genesis::sequence::quality_encoding_name
std::string quality_encoding_name(QualityEncoding encoding)
Return a readable name for each of the encoding types.
Definition: quality.cpp:175
genesis::sequence::QualityEncoding
QualityEncoding
List of quality encodings for which we support decoding.
Definition: quality.hpp:72
genesis::sequence::guess_fastq_quality_encoding
QualityEncoding guess_fastq_quality_encoding(std::shared_ptr< utils::BaseInputSource > source)
Guess the quality score encoding for a fastq input, based on counts of how often each char appeared i...
Definition: quality.cpp:457
genesis::sequence::error_probability_to_solexa_score
signed char error_probability_to_solexa_score(double error_probability)
Definition: quality.cpp:524
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::guess_quality_encoding
QualityEncoding guess_quality_encoding(std::array< size_t, 128 > const &char_counts)
Guess the quality score encoding, based on counts of how often each char appeared in the quality stri...
Definition: quality.cpp:398
char.hpp
genesis::sequence::phred_score_to_solexa_score
signed char phred_score_to_solexa_score(unsigned char phred_score)
Definition: quality.cpp:561
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
genesis::utils::to_lower
constexpr char to_lower(char c) noexcept
Return the lower case version of a letter, ASCII-only.
Definition: char.hpp:221
genesis::sequence::solexa_score_to_phred_score
unsigned char solexa_score_to_phred_score(signed char solexa_score)
Definition: quality.cpp:573
genesis::sequence::solexa_score_to_error_probability
double solexa_score_to_error_probability(signed char solexa_score)
Definition: quality.cpp:552
sequence.hpp
quality.hpp
genesis::sequence::error_probability_to_phred_score
unsigned char error_probability_to_phred_score(double error_probability)
Definition: quality.cpp:502