A library for working with phylogenetic and population genetic data.
v0.32.0
quality.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
39 
40 #include <algorithm>
41 #include <array>
42 #include <cassert>
43 #include <cmath>
44 #include <cstdint>
45 #include <stdexcept>
46 
47 #if defined(GENESIS_AVX) || defined(GENESIS_AVX2) || defined(GENESIS_AVX512)
48 
49  #include <immintrin.h>
50 
51 #endif
52 
53 namespace genesis {
54 namespace sequence {
55 
56 // =================================================================================================
57 // Phred Score To Error Probability
58 // =================================================================================================
59 
60 // Local array for faster lookup of phred score error probabilities.
61 // Code to generate the array:
62 //
63 // #include <iostream>
64 // std::cout.precision( 15 );
65 // size_t c = 0;
66 // for( size_t i = 0; i < 256; ++i ) {
67 // if( ++c == 4 || i % 10 == 0 ) {
68 // c = 0;
69 // std::cout << "\n ";
70 // }
71 // if( i % 10 == 0 ) {
72 // std::cout << "/* " << i << " */ ";
73 // }
74 // std::cout << std::pow( 10.0, static_cast<double>( i ) / -10.0 ) << ", ";
75 // }
76 // std::cout << "\n";
77 //
78 // See phred_score_to_error_probability() for usage. We probably never ever need the values above 93,
79 // as those cannot even be encoded in normal ASCII-based phred score codes, and also probably
80 // don't even need values above 60, which is usually taken as an upper bound. But let's be through
81 // here, and include the whole range of possible values for our data type.
82 static const std::array<double, 256> phred_score_to_error_probability_lookup_ = {{
83  /* 0 */ 1.0, 0.794328234724281, 0.630957344480193, 0.501187233627272,
84  0.398107170553497, 0.316227766016838, 0.251188643150958, 0.199526231496888,
85  0.158489319246111, 0.125892541179417,
86  /* 10 */ 0.1, 0.0794328234724281, 0.0630957344480193, 0.0501187233627272,
87  0.0398107170553497, 0.0316227766016838, 0.0251188643150958, 0.0199526231496888,
88  0.0158489319246111, 0.0125892541179417,
89  /* 20 */ 0.01, 0.00794328234724281, 0.00630957344480193, 0.00501187233627272,
90  0.00398107170553497, 0.00316227766016838, 0.00251188643150958, 0.00199526231496888,
91  0.00158489319246111, 0.00125892541179417,
92  /* 30 */ 0.001, 0.000794328234724281, 0.000630957344480193, 0.000501187233627273,
93  0.000398107170553497, 0.000316227766016838, 0.000251188643150958, 0.000199526231496888,
94  0.000158489319246111, 0.000125892541179417,
95  /* 40 */ 0.0001, 7.94328234724282e-05, 6.30957344480193e-05, 5.01187233627273e-05,
96  3.98107170553497e-05, 3.16227766016838e-05, 2.51188643150958e-05, 1.99526231496888e-05,
97  1.58489319246111e-05, 1.25892541179417e-05,
98  /* 50 */ 1e-05, 7.94328234724282e-06, 6.30957344480193e-06, 5.01187233627272e-06,
99  3.98107170553497e-06, 3.16227766016838e-06, 2.51188643150958e-06, 1.99526231496888e-06,
100  1.58489319246111e-06, 1.25892541179417e-06,
101  /* 60 */ 1e-06, 7.94328234724282e-07, 6.30957344480193e-07, 5.01187233627272e-07,
102  3.98107170553497e-07, 3.16227766016838e-07, 2.51188643150958e-07, 1.99526231496888e-07,
103  1.58489319246111e-07, 1.25892541179417e-07,
104  /* 70 */ 1e-07, 7.94328234724282e-08, 6.30957344480193e-08, 5.01187233627272e-08,
105  3.98107170553497e-08, 3.16227766016838e-08, 2.51188643150958e-08, 1.99526231496888e-08,
106  1.58489319246111e-08, 1.25892541179417e-08,
107  /* 80 */ 1e-08, 7.94328234724282e-09, 6.30957344480194e-09, 5.01187233627271e-09,
108  3.98107170553497e-09, 3.16227766016838e-09, 2.51188643150958e-09, 1.99526231496888e-09,
109  1.58489319246111e-09, 1.25892541179417e-09,
110  /* 90 */ 1e-09, 7.94328234724282e-10, 6.30957344480194e-10, 5.01187233627271e-10,
111  3.98107170553497e-10, 3.16227766016838e-10, 2.51188643150958e-10, 1.99526231496888e-10,
112  1.58489319246111e-10, 1.25892541179417e-10,
113  /* 100 */ 1e-10, 7.94328234724282e-11, 6.30957344480194e-11, 5.01187233627271e-11,
114  3.98107170553497e-11, 3.16227766016838e-11, 2.51188643150958e-11, 1.99526231496888e-11,
115  1.58489319246111e-11, 1.25892541179417e-11,
116  /* 110 */ 1e-11, 7.94328234724282e-12, 6.30957344480194e-12, 5.01187233627271e-12,
117  3.98107170553497e-12, 3.16227766016838e-12, 2.51188643150958e-12, 1.99526231496888e-12,
118  1.58489319246111e-12, 1.25892541179417e-12,
119  /* 120 */ 1e-12, 7.94328234724282e-13, 6.30957344480194e-13, 5.01187233627271e-13,
120  3.98107170553497e-13, 3.16227766016838e-13, 2.51188643150958e-13, 1.99526231496888e-13,
121  1.58489319246111e-13, 1.25892541179417e-13,
122  /* 130 */ 1e-13, 7.94328234724282e-14, 6.30957344480194e-14, 5.01187233627271e-14,
123  3.98107170553497e-14, 3.16227766016838e-14, 2.51188643150958e-14, 1.99526231496888e-14,
124  1.58489319246111e-14, 1.25892541179417e-14,
125  /* 140 */ 1e-14, 7.94328234724282e-15, 6.30957344480194e-15, 5.01187233627271e-15,
126  3.98107170553497e-15, 3.16227766016838e-15, 2.51188643150958e-15, 1.99526231496888e-15,
127  1.58489319246111e-15, 1.25892541179417e-15,
128  /* 150 */ 1e-15, 7.94328234724282e-16, 6.30957344480194e-16, 5.01187233627271e-16,
129  3.98107170553497e-16, 3.16227766016838e-16, 2.51188643150958e-16, 1.99526231496888e-16,
130  1.58489319246111e-16, 1.25892541179417e-16,
131  /* 160 */ 1e-16, 7.94328234724279e-17, 6.30957344480194e-17, 5.01187233627271e-17,
132  3.98107170553499e-17, 3.16227766016838e-17, 2.51188643150957e-17, 1.99526231496888e-17,
133  1.58489319246111e-17, 1.25892541179417e-17,
134  /* 170 */ 1e-17, 7.94328234724279e-18, 6.30957344480194e-18, 5.01187233627271e-18,
135  3.98107170553499e-18, 3.16227766016838e-18, 2.51188643150957e-18, 1.99526231496888e-18,
136  1.58489319246111e-18, 1.25892541179417e-18,
137  /* 180 */ 1e-18, 7.94328234724279e-19, 6.30957344480194e-19, 5.01187233627271e-19,
138  3.98107170553499e-19, 3.16227766016838e-19, 2.51188643150957e-19, 1.99526231496888e-19,
139  1.58489319246111e-19, 1.25892541179417e-19,
140  /* 190 */ 1e-19, 7.94328234724279e-20, 6.30957344480194e-20, 5.01187233627271e-20,
141  3.98107170553499e-20, 3.16227766016838e-20, 2.51188643150957e-20, 1.99526231496888e-20,
142  1.58489319246111e-20, 1.25892541179417e-20,
143  /* 200 */ 1e-20, 7.94328234724279e-21, 6.30957344480194e-21, 5.01187233627271e-21,
144  3.98107170553499e-21, 3.16227766016838e-21, 2.51188643150957e-21, 1.99526231496888e-21,
145  1.58489319246111e-21, 1.25892541179417e-21,
146  /* 210 */ 1e-21, 7.94328234724279e-22, 6.30957344480194e-22, 5.01187233627272e-22,
147  3.98107170553499e-22, 3.16227766016838e-22, 2.51188643150957e-22, 1.99526231496888e-22,
148  1.58489319246111e-22, 1.25892541179417e-22,
149  /* 220 */ 1e-22, 7.94328234724279e-23, 6.30957344480194e-23, 5.01187233627271e-23,
150  3.98107170553499e-23, 3.16227766016838e-23, 2.51188643150957e-23, 1.99526231496888e-23,
151  1.58489319246111e-23, 1.25892541179417e-23,
152  /* 230 */ 1e-23, 7.94328234724279e-24, 6.30957344480194e-24, 5.01187233627271e-24,
153  3.98107170553499e-24, 3.16227766016838e-24, 2.51188643150957e-24, 1.99526231496888e-24,
154  1.58489319246111e-24, 1.25892541179417e-24,
155  /* 240 */ 1e-24, 7.94328234724279e-25, 6.30957344480194e-25, 5.01187233627272e-25,
156  3.98107170553499e-25, 3.16227766016838e-25, 2.51188643150957e-25, 1.99526231496888e-25,
157  1.58489319246111e-25, 1.25892541179417e-25,
158  /* 250 */ 1e-25, 7.94328234724279e-26, 6.30957344480194e-26, 5.01187233627271e-26,
159  3.98107170553499e-26, 3.16227766016838e-26
160 }};
161 
162 // =================================================================================================
163 // Quality Encoding and Decoding
164 // =================================================================================================
165 
169 inline void throw_invalid_quality_code_( char quality_code, QualityEncoding encoding )
170 {
171  throw std::invalid_argument(
172  "Invalid quality code: " + utils::char_to_hex( quality_code ) +
173  " is not in the valid range for " + quality_encoding_name( encoding ) + " quality codes."
174  );
175 }
176 
177 std::string quality_encoding_name( QualityEncoding encoding, bool with_offset )
178 {
179  auto const wo = with_offset;
180  switch( encoding ) {
182  return std::string( "Sanger" ) + std::string( wo ? " (ASCII offset 33)" : "" );
184  return std::string( "Illumina 1.3+" ) + std::string( wo ? " (ASCII offset 64)" : "" );
186  return std::string( "Illumina 1.5+" ) + std::string( wo ? " (ASCII offset 64)" : "" );
188  return std::string( "Illumina 1.8+" ) + std::string( wo ? " (ASCII offset 33)" : "" );
190  return std::string( "Solexa" ) + std::string( wo ? " (ASCII offset 64)" : "" );
191 
192  default:
193  throw std::invalid_argument( "Invalid quality encoding type." );
194  };
195 }
196 
198 {
199  // Transform the string into a uniform form.
200  auto s = utils::to_lower( name );
201  s.erase( std::remove_if(s.begin(), s.end(), []( char c ){
202  return ! utils::is_alnum( c );
203  }), s.end() );
204 
205  // Match the enum.
206  if( s == "sanger" ) {
208  } else if( s == "illumina13" ) {
210  } else if( s == "illumina15" ) {
212  } else if( s == "illumina18" || s == "illumina" ) {
214  } else if( s == "solexa" ) {
216  } else {
217  throw std::runtime_error( "Invalid quality encoding name: \"" + name + "\"" );
218  }
219 }
220 
221 unsigned char quality_decode_to_phred_score( char quality_code, QualityEncoding encoding )
222 {
223  // Convert using an offset. It's as simple as that.
224  // Except that we have different offsets for different fastq encoding styles.
225  // And also, Solexa needs special treatment, as we internally use phred scores only.
226  // Basically, fastq is again one of those weird bioinformatics file formats that drives
227  // everyone crazy by being ill-defined, and having contraticting variants and conventions...
228  // NB: We do not check for upper bounds of the scores here, as higher-quality scores can come
229  // from downstream processing.
230  switch( encoding ) {
233  if( quality_code < 33 || quality_code >= 127 ) {
234  throw_invalid_quality_code_( quality_code, encoding );
235  }
236  return static_cast<unsigned char>( quality_code ) - 33;
237 
240  if( quality_code < 64 || quality_code >= 127 ) {
241  throw_invalid_quality_code_( quality_code, encoding );
242  }
243  return static_cast<unsigned char>( quality_code ) - 64;
244 
246  if( quality_code < 59 || quality_code >= 127 ) {
247  throw_invalid_quality_code_( quality_code, encoding );
248  }
249  return solexa_score_to_phred_score( static_cast<signed char>( quality_code ) - 64 );
250 
251  default:
252  throw std::invalid_argument( "Invalid quality encoding type." );
253  };
254 }
255 
256 std::vector<unsigned char> quality_decode_to_phred_score(
257  std::string const& quality_codes,
258  QualityEncoding encoding
259 ) {
260  // Reserve space for the result. Do this first, to facilitate copy elision.
261  auto result = std::vector<unsigned char>( quality_codes.size() );
262 
263  // Only switch on the encoding once, for speed. We use a fake offset for Solexa,
264  // as they can go in the negative range to -5. Doing it this way makes our error checking
265  // code consistent. We correct for this later in the Solxa conversion loop below.
266  unsigned char offset;
267  switch( encoding ) {
270  offset = 33;
271  break;
272 
275  offset = 64;
276  break;
277 
279  offset = 59;
280  break;
281 
282  default:
283  throw std::invalid_argument( "Invalid quality encoding type." );
284  };
285 
286  // We use this as an indicator whether we found an error in the sequence.
287  // This allows to run the inner loops without branching.
288  bool good = true;
289 
290  // We use a loop variable here that is shared between the AVX and normal version,
291  // so that we do not have to duplicate the loop.
292  size_t i = 0;
293 
294  // In the loops below, we convert Solexa as if it was phred, and fix this later.
295  // This avoids code duplication for the error checking.
296 
297  #ifdef GENESIS_AVX2
298 
299  // Get the data in the right format
300  auto const* data = reinterpret_cast<__m256i const*>( quality_codes.c_str() );
301  auto* write = reinterpret_cast<__m256i*>( result.data() );
302 
303  // Define some masks. We use 32 byte full of the offset, and full of the upper limit for
304  // ascii-encoded phred scores.
305  auto const m_offset = _mm256_set1_epi8( offset );
306  auto static const m_upper = _mm256_set1_epi8( 126 );
307 
308  // Process in 32-byte chunks. The loop condition uses integer division / 32,
309  // so we miss the rest that is above a multiple of 32. We process this later.
310  for( size_t j = 0; j < quality_codes.size() / 32; ++j ) {
311 
312  // Load the data and compare it to the offset and upper limit.
313  auto const m_data = _mm256_loadu_si256( data + j );
314  auto const m_min = _mm256_cmpgt_epi8( m_offset, m_data );
315  auto const m_max = _mm256_cmpgt_epi8( m_data, m_upper );
316 
317  // If any char is below the offset or above the upper limit, we have an error.
318  // We just store the result of the test, to avoid branching.
319  good &= _mm256_testz_si256( m_min, m_min ) && _mm256_testz_si256( m_max, m_max );
320 
321  // Subtract the offset, and store the result.
322  auto const m_result = _mm256_sub_epi8( m_data, m_offset );
323  _mm256_storeu_si256( write + j, m_result );
324  }
325 
326  // Set i to what it needs to be so that the loop below processes the remaining chars
327  // that are left in the string after AVX is done with the 32 byte chunks.
328  i = quality_codes.size() / 32 * 32;
329 
330  #endif // GENESIS_AVX2
331 
332  // Run the conversion. Again, we avoid branching in the inner loop.
333  // i is already initialized, either from before the AVX code, or from within the AVX code.
334  // This avoids duplicating the below loop for the rest of the chars that are in the remainder
335  // of the string after AVX has processed all 32 byte chunks.
336  for( ; i < quality_codes.size(); ++i ) {
337  good &= ( quality_codes[i] >= offset ) && ( quality_codes[i] <= 126 );
338  result[i] = static_cast<unsigned char>( quality_codes[i] ) - offset;
339  }
340 
341  // If we found an error in the sequence, throw an exception. We run the whole sequence again,
342  // to get the exact char that is bad. This is only in the error case, so we do not care about speed.
343  if( !good ) {
344  for( size_t i = 0; i < quality_codes.size(); ++i ) {
345  if( quality_codes[i] < offset || quality_codes[i] >= 127 ) {
346  throw_invalid_quality_code_( quality_codes[i], encoding );
347  }
348  }
349 
350  // We cannot get here, otherwise, the !good condition would not have led us here.
351  assert( false );
352  }
353 
354  // Use 64bit words instead of vectorization. Maybe later, as an alternative to AVX.
355  // Macros from https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
356  //
357  // #define hasless(x,n) (((x)-~static_cast<uint64_t>(0)/255U*(n))&~(x)&~static_cast<uint64_t>(0)/255U*128)
358  // #define hasmore(x,n) (((x)+~static_cast<uint64_t>(0)/255U*(127-(n))|(x))&~static_cast<uint64_t>(0)/255U*128)
359  //
360  // auto const subtrahend = ~static_cast<uint64_t>(0) / 255U * offset;
361  //
362  // auto const* data = reinterpret_cast<uint64_t const*>( quality_codes.c_str() );
363  // auto* write = reinterpret_cast<uint64_t*>( result.data() );
364  // bool failed = false;
365  //
366  // for( size_t i = 0; i < quality_codes.size() / 8; ++i ) {
367  // bool const l = hasless( data[i], offset );
368  // bool const m = hasmore( data[i], 126 );
369  // failed |= l | m;
370  //
371  // write[i] = data[i] - subtrahend;
372  // }
373  //
374  // #undef hasless
375  // #undef hasmore
376  //
377  // if( failed ) {
378  // throw std::invalid_argument(
379  // "Invalid quality code that is not in the valid range for " +
380  // quality_encoding_name( encoding ) + " quality codes found in sequence."
381  // );
382  // }
383 
384  // For Solexa, we iterate the sequence again in order to convert it to phred.
385  // This is slower and could be avoided with a bit of code duplication, but no one uses that
386  // format anyway any more, so that case should be rare.
387  if( encoding == QualityEncoding::kSolexa ) {
388  for( size_t i = 0; i < quality_codes.size(); ++i ) {
389  result[i] = solexa_score_to_phred_score( result[i] - 5 );
390  }
391  }
392 
393  // Finally, we are done.
394  return result;
395 }
396 
397 // =================================================================================================
398 // Guess Quality Encoding Type
399 // =================================================================================================
400 
402 {
403  // Check rhs.
404  if(
405  rhs != QualityEncoding::kSanger &&
410  ) {
411  throw std::invalid_argument( "Invalid quality encoding type." );
412  }
413 
414  // Check lhs, and return compatibility.
415  switch( lhs ) {
419 
423 
425  return rhs == QualityEncoding::kSolexa;
426 
427  default:
428  throw std::invalid_argument( "Invalid quality encoding type." );
429  };
430  return false;
431 }
432 
433 QualityEncoding guess_quality_encoding( std::array<size_t, 128> const& char_counts )
434 {
435  // Find the first entry that is not 0
436  size_t min = 0;
437  for( size_t i = 0; i < char_counts.size(); ++i ) {
438  if( char_counts[i] > 0 ) {
439  min = i;
440  break;
441  }
442  }
443 
444  // Find the last entry that is not 0
445  size_t max = 128;
446  for( size_t i = char_counts.size(); i > 0; --i ) {
447  if( char_counts[i-1] > 0 ) {
448  max = i-1;
449  break;
450  }
451  }
452 
453  // Some checks.
454  if( min == char_counts.size() ) {
455  assert( max == 0 );
456  throw std::runtime_error(
457  "Cannot guess quality encoding, as all quality code counts are zero."
458  );
459  }
460  if( min < 33 || max >= 127 ) {
461  throw std::runtime_error(
462  "Invalid char counts provided to guess quality score encoding. Only printable "
463  "characters (ASCII range 33 to 127) are allowed in fastq quality encodings."
464  );
465  }
466  assert( min <= max );
467 
468  // Sanger and Illumina 1.8 use an offset of 33. The next higher offset is 64, but with Solexa ranging into the negative until -5, we find that anything below 64-5=59 cannot have the 64 offset, and hence must have the 33 offset.
469  if( min < 59 ) {
470  // Sanger and Illumina 1.8 are basically the same, so it does not make a difference
471  // whether we detect them correctly or not. However, we can still try to guess, for
472  // completeness. Illumina 1.8 seems to have one more character that can be used. Thus, if
473  // this character occurs, we can be sure. If not, it might just be that no base was that
474  // accurate. But then, it doesn't really matter anyway.
475  if( max > 73 ) {
477  } else {
479  }
480  }
481 
482  // Solexa goes down to a score of -5, with an offset of 64 for 0, so anything below 64 is
483  // negative, meaning that it cannot be Illumina 1.3 or 1.5.
484  if( min < 64 ) {
486  }
487 
488  // At this point, we could use a heuristic to test how common 'B' is, which is special in Illumina 1.5,
489  // see https://github.com/brentp/bio-playground/blob/master/reads-utils/guess-encoding.py for
490  // details. This would enable more fine-grained distinction between Illumina 1.3 and 1.5.
491  // But for now, we simply assume that an encoding without anything before 'B' is Illumina 1.5
492  if( min < 66 ) {
494  }
496 }
497 
499  std::shared_ptr< utils::BaseInputSource > source,
500  size_t max_lines,
501  size_t max_chars
502 ) {
503  // Init a counting array for each char, use value-initialization to 0
504  auto char_counts = std::array<size_t, 128>();
505  size_t lines_cnt = 0;
506  size_t chars_cnt = 0;
507 
508  // Prepare a reader that simply increments all char counts for the quality chars
509  // that are found in the sequences.
510  auto reader = FastqReader();
511  reader.quality_string_plugin(
512  [&]( std::string const& quality_string, Sequence& sequence )
513  {
514  (void) sequence;
515  for( auto q : quality_string ) {
516 
517  // Cast, so that we catch the issue that char can be signed or unsigned,
518  // depending on the compiler. This might cause unnecessary warnings otherwise.
519  auto qu = static_cast<unsigned char>(q);
520  if( qu > 127 ) {
521  throw std::invalid_argument(
522  "Invalid quality score character outside of the ASCII range."
523  );
524  }
525 
526  // Increment the count for the given character.
527  // assert( (qu & ~0x7f) == 0 );
528  assert( qu < char_counts.size() );
529  ++char_counts[ qu ];
530 
531  // Only count up to a limit of chars, if provided.
532  ++chars_cnt;
533  if( max_chars > 0 && chars_cnt > max_chars ) {
534  break;
535  }
536  }
537  }
538  );
539 
540  // Read the input, sequence by sequence.
541  utils::InputStream input_stream( source );
542  Sequence seq;
543  while( reader.parse_sequence( input_stream, seq ) ) {
544  // Here, we check the limits, if used.
545  if( max_chars > 0 && chars_cnt > max_chars ) {
546  break;
547  }
548 
549  // For lines, we need to stop once we have reached the limit, not one after,
550  // as the counting is dont in the preprocessing of the reader iterator already.
551  ++lines_cnt;
552  if( max_lines > 0 && lines_cnt >= max_lines ) {
553  break;
554  }
555  }
556 
557  // Return our guess based on the quality characters that were found in the sequences.
558  return guess_quality_encoding( char_counts );
559 }
560 
561 // =================================================================================================
562 // Quality Computations
563 // =================================================================================================
564 
565 unsigned char error_probability_to_phred_score( double error_probability )
566 {
567  if( ! std::isfinite( error_probability ) || error_probability < 0.0 || error_probability > 1.0 ) {
568  throw std::invalid_argument(
569  "Cannot convert error probability outside of range [0.0, 1.0] to phred score."
570  );
571  }
572 
573  // Compute the value and put into the valid range for unsigned chars. This might exceed
574  // the encoding that is later used to store the scores in fastq, but this does not concern us
575  // here. Instead, we offer the full range here, and clamp later to the value range when encoding.
576  double v = std::round( -10.0 * std::log10( error_probability ));
577  v = std::min( v, 255.0 );
578  return static_cast<unsigned char>( v );
579 }
580 
581 double phred_score_to_error_probability( unsigned char phred_score )
582 {
583  return phred_score_to_error_probability_lookup_[ phred_score ];
584  // return std::pow( 10.0, static_cast<double>( phred_score ) / -10.0 );
585 }
586 
587 signed char error_probability_to_solexa_score( double error_probability )
588 {
589  if( ! std::isfinite( error_probability ) || error_probability < 0.0 || error_probability > 1.0 ) {
590  throw std::invalid_argument(
591  "Cannot convert error probability outside of range [0.0, 1.0] to solexa score."
592  );
593  }
594 
595  // The following are the limits than can be encoded in typical fastq-solexa encoding.
596  // We are not using them here, but instead use more relaxed limts, and will apply the actual
597  // limits only when encoding to fastq.
598  // // min that can be encoded in fastq with solexa encoding
599  // if( error_probability < 6.30957344e-7 ) {
600  // return 62;
601  // }
602  // // max that can be encoded in fastq with solexa encoding
603  // if( error_probability > 0.75 ) {
604  // return -5;
605  // }
606 
607  // Compute the score, and check its boundaries. We did a check before, so this is duplication,
608  // but leads to nicer user output.
609  double v = std::round( -10.0 * std::log10( error_probability / ( 1.0 - error_probability )));
610  v = std::min( v, 127.0 );
611  v = std::max( v, -128.0 );
612  return static_cast<signed char>( v );
613 }
614 
615 double solexa_score_to_error_probability( signed char solexa_score )
616 {
617  if( solexa_score < -5 ) {
618  solexa_score = -5;
619  }
620  double const t = std::pow( 10.0, static_cast<double>( solexa_score ) / -10.0 );
621  return t / ( 1.0 + t );
622 }
623 
624 signed char phred_score_to_solexa_score( unsigned char phred_score )
625 {
626  if( phred_score <= 1 ) {
627  return -5;
628  }
629  double v = std::round(
630  10.0 * std::log10( std::pow( 10.0, static_cast<double>( phred_score ) / 10.0 ) - 1.0 )
631  );
632  v = std::min( v, 127.0 );
633  return static_cast<signed char>( v );
634 }
635 
636 unsigned char solexa_score_to_phred_score( signed char solexa_score )
637 {
638  // This cannot overflow, so no need to check.
639  return static_cast<unsigned char>( std::round(
640  10.0 * std::log10( std::pow( 10.0, static_cast<double>( solexa_score ) / 10.0 ) + 1.0 )
641  ));
642 }
643 
644 std::vector<unsigned char> error_probability_to_phred_score( std::vector<double> error_probability )
645 {
646  auto res = std::vector<unsigned char>( error_probability.size(), 0 );
647  for( size_t i = 0; i < error_probability.size(); ++i ) {
648  res[i] = error_probability_to_phred_score( error_probability[i] );
649  }
650  return res;
651 }
652 
653 std::vector<double> phred_score_to_error_probability( std::vector<unsigned char> phred_score )
654 {
655  auto res = std::vector<double>( phred_score.size(), 0 );
656  for( size_t i = 0; i < phred_score.size(); ++i ) {
657  res[i] = phred_score_to_error_probability( phred_score[i] );
658  }
659  return res;
660 }
661 
662 std::vector<signed char> error_probability_to_solexa_score( std::vector<double> error_probability )
663 {
664  auto res = std::vector<signed char>( error_probability.size(), 0 );
665  for( size_t i = 0; i < error_probability.size(); ++i ) {
666  res[i] = error_probability_to_solexa_score( error_probability[i] );
667  }
668  return res;
669 }
670 
671 std::vector<double> solexa_score_to_error_probability( std::vector<signed char> solexa_score )
672 {
673  auto res = std::vector<double>( solexa_score.size(), 0 );
674  for( size_t i = 0; i < solexa_score.size(); ++i ) {
675  res[i] = solexa_score_to_error_probability( solexa_score[i] );
676  }
677  return res;
678 }
679 
680 std::vector<signed char> phred_score_to_solexa_score( std::vector<unsigned char> phred_score )
681 {
682  auto res = std::vector<signed char>( phred_score.size(), 0 );
683  for( size_t i = 0; i < phred_score.size(); ++i ) {
684  res[i] = phred_score_to_solexa_score( phred_score[i] );
685  }
686  return res;
687 }
688 
689 std::vector<unsigned char> solexa_score_to_phred_score( std::vector<signed char> solexa_score )
690 {
691  auto res = std::vector<unsigned char>( solexa_score.size(), 0 );
692  for( size_t i = 0; i < solexa_score.size(); ++i ) {
693  res[i] = solexa_score_to_phred_score( solexa_score[i] );
694  }
695  return res;
696 }
697 
698 } // namespace sequence
699 } // namespace genesis
genesis::sequence::phred_score_to_error_probability
double phred_score_to_error_probability(unsigned char phred_score)
Definition: quality.cpp:581
genesis::sequence::quality_decode_to_phred_score
unsigned char quality_decode_to_phred_score(char quality_code, QualityEncoding encoding)
Decode a single quality score char (for example coming from a fastq file) to a phred score.
Definition: quality.cpp:221
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
genesis::sequence::phred_score_to_error_probability_lookup_
static const std::array< double, 256 > phred_score_to_error_probability_lookup_
Definition: quality.cpp:82
genesis::sequence::throw_invalid_quality_code_
void throw_invalid_quality_code_(char quality_code, QualityEncoding encoding)
Local helper function to throw if any invalid fastq quality chars are being used.
Definition: quality.cpp:169
fastq_reader.hpp
genesis::sequence::guess_quality_encoding_from_name
QualityEncoding guess_quality_encoding_from_name(std::string const &name)
Guess the QualityEncoding type, given its description name.
Definition: quality.cpp:197
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
genesis::sequence::quality_encoding_name
std::string quality_encoding_name(QualityEncoding encoding, bool with_offset)
Return a readable name for each of the encoding types.
Definition: quality.cpp:177
genesis::sequence::QualityEncoding::kSanger
@ kSanger
genesis::sequence::QualityEncoding::kIllumina15
@ kIllumina15
genesis::sequence::FastqReader
Read Fastq sequence data.
Definition: fastq_reader.hpp:149
genesis::utils::offset
void offset(Histogram &h, double value)
Definition: operations.cpp:47
genesis::sequence::QualityEncoding::kSolexa
@ kSolexa
input_source.hpp
genesis::sequence::QualityEncoding::kIllumina13
@ kIllumina13
genesis::sequence::compatible_quality_encodings
bool compatible_quality_encodings(QualityEncoding lhs, QualityEncoding rhs)
Return whether two quality encodings are compatible with each other.
Definition: quality.cpp:401
string.hpp
Provides some commonly used string utility functions.
input_stream.hpp
genesis::sequence::QualityEncoding::kIllumina18
@ kIllumina18
genesis::sequence::QualityEncoding
QualityEncoding
List of quality encodings for which we support decoding.
Definition: quality.hpp:72
genesis::sequence::error_probability_to_solexa_score
signed char error_probability_to_solexa_score(double error_probability)
Definition: quality.cpp:587
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::guess_quality_encoding
QualityEncoding guess_quality_encoding(std::array< size_t, 128 > const &char_counts)
Guess the quality score encoding, based on counts of how often each char appeared in the quality stri...
Definition: quality.cpp:433
genesis::sequence::guess_fastq_quality_encoding
QualityEncoding guess_fastq_quality_encoding(std::shared_ptr< utils::BaseInputSource > source, size_t max_lines, size_t max_chars)
Guess the quality score encoding for a fastq input, based on counts of how often each char appeared i...
Definition: quality.cpp:498
char.hpp
genesis::sequence::phred_score_to_solexa_score
signed char phred_score_to_solexa_score(unsigned char phred_score)
Definition: quality.cpp:624
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
genesis::utils::to_lower
constexpr char to_lower(char c) noexcept
Return the lower case version of a letter, ASCII-only.
Definition: char.hpp:221
genesis::sequence::solexa_score_to_phred_score
unsigned char solexa_score_to_phred_score(signed char solexa_score)
Definition: quality.cpp:636
genesis::sequence::solexa_score_to_error_probability
double solexa_score_to_error_probability(signed char solexa_score)
Definition: quality.cpp:615
sequence.hpp
quality.hpp
genesis::sequence::error_probability_to_phred_score
unsigned char error_probability_to_phred_score(double error_probability)
Definition: quality.cpp:565