A toolkit for working with phylogenetic data.
v0.24.0
quality.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 
39 #include <algorithm>
40 #include <cassert>
41 #include <cmath>
42 #include <cstdint>
43 #include <stdexcept>
44 
45 #ifdef GENESIS_AVX
46  #include <immintrin.h>
47 #endif
48 
49 namespace genesis {
50 namespace sequence {
51 
52 // =================================================================================================
53 // Quality Encoding and Decoding
54 // =================================================================================================
55 
59 inline void throw_invalid_quality_code_( char quality_code, QualityEncoding encoding )
60 {
61  throw std::invalid_argument(
62  "Invalid quality code: " + utils::char_to_hex( quality_code ) +
63  " is not in the valid range for " + quality_encoding_name( encoding ) + " quality codes."
64  );
65 }
66 
67 std::string quality_encoding_name( QualityEncoding encoding )
68 {
69  switch( encoding ) {
71  return "Sanger";
73  return "Illumina 1.3+";
75  return "Illumina 1.5+";
77  return "Illumina 1.8+";
79  return "Solexa";
80 
81  default:
82  throw std::invalid_argument( "Invalid quality encoding type." );
83  };
84 }
85 
86 unsigned char quality_decode_to_phred_score( char quality_code, QualityEncoding encoding )
87 {
88  // Convert using an offset. It's as simple as that.
89  // Except that we have different offsets for different fastq encoding styles.
90  // And also, Solexa needs special treatment, as we internally use phred scores only.
91  // Basically, fastq is again one of those weird bioinformatics file formats that drives
92  // everyone crazy by being ill-defined, and having contraticting variants and conventions...
93  // NB: We do not check for upper bounds of the scores here, as higher-quality scores can come
94  // from downstream processing.
95  switch( encoding ) {
98  if( quality_code < 33 || quality_code >= 127 ) {
99  throw_invalid_quality_code_( quality_code, encoding );
100  }
101  return static_cast<unsigned char>( quality_code ) - 33;
102 
105  if( quality_code < 64 || quality_code >= 127 ) {
106  throw_invalid_quality_code_( quality_code, encoding );
107  }
108  return static_cast<unsigned char>( quality_code ) - 64;
109 
111  if( quality_code < 59 || quality_code >= 127 ) {
112  throw_invalid_quality_code_( quality_code, encoding );
113  }
114  return solexa_score_to_phred_score( static_cast<unsigned char>( quality_code ) - 64 );
115 
116  default:
117  throw std::invalid_argument( "Invalid quality encoding type." );
118  };
119 }
120 
121 std::vector<unsigned char> quality_decode_to_phred_score(
122  std::string const& quality_codes,
123  QualityEncoding encoding
124 ) {
125  // Reserve space for the result. Do this first, to facilitate copy elision.
126  auto result = std::vector<unsigned char>( quality_codes.size() );
127 
128  // Only switch on the encoding once, for speed. We use a fake offset for Solexa,
129  // as they can go in the negative range to -5. Doing it this way makes our error checking
130  // code consistent. We correct for this later in the Solxa conversion loop below.
131  unsigned char offset;
132  switch( encoding ) {
135  offset = 33;
136  break;
137 
140  offset = 64;
141  break;
142 
144  offset = 59;
145  break;
146 
147  default:
148  throw std::invalid_argument( "Invalid quality encoding type." );
149  };
150 
151  // We use this as an indicator whether we found an error in the sequence.
152  // This allows to run the inner loops without branching.
153  bool good = true;
154 
155  // We use a loop variable here that is shared between the AVX and normal version,
156  // so that we do not have to duplicate the loop.
157  size_t i = 0;
158 
159  // In the loops below, we convert Solexa as if it was phred, and fix this later.
160  // This avoids code duplication for the error checking.
161 
162  #ifdef GENESIS_AVX
163 
164  // Get the data in the right format
165  auto const* data = reinterpret_cast<__m256i const*>( quality_codes.c_str() );
166  auto* write = reinterpret_cast<__m256i*>( result.data() );
167 
168  // Define some masks. We use 32 byte full of the offset, and full of the upper limit for
169  // ascii-encoded phred scores.
170  auto const m_offset = _mm256_set1_epi8( offset );
171  auto static const m_upper = _mm256_set1_epi8( 126 );
172 
173  // Process in 32-byte chunks. The loop condition uses integer division / 32, so we miss the rest
174  // that is above a multiple of 32. We process this later.
175  for( size_t j = 0; j < quality_codes.size() / 32; ++j ) {
176 
177  // Load the data and compare it to the offset and upper limit.
178  auto const m_data = _mm256_loadu_si256( data + j );
179  auto const m_min = _mm256_cmpgt_epi8( m_offset, m_data );
180  auto const m_max = _mm256_cmpgt_epi8( m_data, m_upper );
181 
182  // If any char is below the offset or above the upper limit, we have an error.
183  // We just store the result of the test, to avoid branching.
184  good &= _mm256_testz_si256( m_min, m_min ) && _mm256_testz_si256( m_max, m_max );
185 
186  // Subtract the offset, and store the result.
187  auto const m_result = _mm256_sub_epi8( m_data, m_offset );
188  _mm256_storeu_si256( write + j, m_result );
189  }
190 
191  // Set i to what it needs to be so that the loop below processes the remaining chars that are
192  // left in the string after AVX is done with the 32 byte chunks.
193  i = quality_codes.size() / 32 * 32;
194 
195  #endif // GENESIS_AVX
196 
197  // Run the conversion. Again, we avoid branching in the inner loop.
198  // i is already initialized, either from before the AVX code, or from within the AVX code.
199  // This avoids duplicating the below loop for the rest of the chars that are in the remainder
200  // of the string after AVX has processed all 32 byte chunks.
201  for( ; i < quality_codes.size(); ++i ) {
202  good &= ( quality_codes[i] >= offset ) && ( quality_codes[i] <= 126 );
203  result[i] = static_cast<unsigned char>( quality_codes[i] ) - offset;
204  }
205 
206  // If we found an error in the sequence, throw an exception. We run the whole sequence again,
207  // to get the exact char that is bad. This is only in the error case, so we do not care about speed.
208  if( !good ) {
209  for( size_t i = 0; i < quality_codes.size(); ++i ) {
210  if( quality_codes[i] < offset || quality_codes[i] >= 127 ) {
211  throw_invalid_quality_code_( quality_codes[i], encoding );
212  }
213  }
214 
215  // We cannot get here, otherwise, the !good condition would not have led us here.
216  assert( false );
217  }
218 
219  // Use 64bit words instead of vectorization. Maybe later, as an alternative to AVX.
220  // Macros from https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
221  //
222  // #define hasless(x,n) (((x)-~static_cast<uint64_t>(0)/255U*(n))&~(x)&~static_cast<uint64_t>(0)/255U*128)
223  // #define hasmore(x,n) (((x)+~static_cast<uint64_t>(0)/255U*(127-(n))|(x))&~static_cast<uint64_t>(0)/255U*128)
224  //
225  // auto const subtrahend = ~static_cast<uint64_t>(0) / 255U * offset;
226  //
227  // auto const* data = reinterpret_cast<uint64_t const*>( quality_codes.c_str() );
228  // auto* write = reinterpret_cast<uint64_t*>( result.data() );
229  // bool failed = false;
230  //
231  // for( size_t i = 0; i < quality_codes.size() / 8; ++i ) {
232  // bool const l = hasless( data[i], offset );
233  // bool const m = hasmore( data[i], 126 );
234  // failed |= l | m;
235  //
236  // write[i] = data[i] - subtrahend;
237  // }
238  //
239  // #undef hasless
240  // #undef hasmore
241  //
242  // if( failed ) {
243  // throw std::invalid_argument(
244  // "Invalid quality code that is not in the valid range for " +
245  // quality_encoding_name( encoding ) + " quality codes found in sequence."
246  // );
247  // }
248 
249  // For Solexa, we iterate the sequence again in order to convert it to phred.
250  // This is slower and could be avoided with a bit of code duplication, but no one uses that
251  // format anyway any more, so that case should be rare.
252  if( encoding == QualityEncoding::kSolexa ) {
253  for( size_t i = 0; i < quality_codes.size(); ++i ) {
254  result[i] = solexa_score_to_phred_score( result[i] - 5 );
255  }
256  }
257 
258  // Finally, we are done.
259  return result;
260 }
261 
262 // =================================================================================================
263 // Guess Quality Encoding Type
264 // =================================================================================================
265 
266 QualityEncoding guess_fastq_quality_encoding( std::array<size_t, 128> const& char_counts )
267 {
268  // Find the first entry that is not 0
269  size_t min = 0;
270  for( size_t i = 0; i < char_counts.size(); ++i ) {
271  if( char_counts[i] > 0 ) {
272  min = i;
273  break;
274  }
275  }
276 
277  // Find the last entry that is not 0
278  size_t max = 128;
279  for( size_t i = char_counts.size(); i > 0; --i ) {
280  if( char_counts[i-1] > 0 ) {
281  max = i-1;
282  break;
283  }
284  }
285 
286  // Some checks.
287  if( min < 33 || max >= 127 ) {
288  throw std::runtime_error(
289  "Invalid char counts provided to guess quality score encoding. Only printable "
290  "characters (ASCII range 33 to 127) are allowed in fastq quality encodings."
291  );
292  }
293  assert( min <= max );
294 
295  // Sanger and Illumina 1.8 use an offset of 33. The next higher offset is 64, but with Solexa ranging into the negative until -5, we find that anything below 64-5=59 cannot have the 64 offset, and hence must have the 33 offset.
296  if( min < 59 ) {
297  // Sanger and Illumina 1.8 are basically the same, so it does not make a difference
298  // whether we detect them correctly or not. However, we can still try to guess, for
299  // completeness. Illumina 1.8 seems to have one more character that can be used. Thus, if
300  // this character occurs, we can be sure. If not, it might just be that no base was that
301  // accurate. But then, it doesn't really matter anyway.
302  if( max > 73 ) {
304  } else {
306  }
307  }
308 
309  // Solexa goes down to a score of -5, with an offset of 64 for 0, so anything below 64 is
310  // negative, meaning that it cannot be Illumina 1.3 or 1.5.
311  if( min < 64 ) {
313  }
314 
315  // At this point, we could use a heuristic to test how common 'B' is, which is special in Illumina 1.5,
316  // see https://github.com/brentp/bio-playground/blob/master/reads-utils/guess-encoding.py for
317  // details. This would enable more fine-grained distinction between Illumina 1.3 and 1.5.
318  // But for now, we simply assume that an encoding without anything before 'B' is Illumina 1.5
319  if( min < 66 ) {
321  }
323 }
324 
325 QualityEncoding guess_fastq_quality_encoding( std::shared_ptr< utils::BaseInputSource > source )
326 {
327  // Init a counting array for each char, use value-initialization to 0
328  auto char_counts = std::array<size_t, 128>();
329 
330  // Prepare a reader that simply increments all char counts for the quality chars
331  // that are found in the sequences.
332  auto reader = FastqReader();
333  reader.quality_string_plugin(
334  [&]( std::string const& quality_string, Sequence& sequence )
335  {
336  (void) sequence;
337  for( auto q : quality_string ) {
338 
339  // Cast, so that we catch the issue that char can be signed or unsigned,
340  // depending on the compiler. This might cause unnecessary warnings otherwise.
341  auto qu = static_cast<unsigned char>(q);
342  if( qu > 127 ) {
343  throw std::invalid_argument(
344  "Invalid quality score character outside of the ASCII range."
345  );
346  }
347 
348  // assert( (qu & ~0x7f) == 0 );
349  assert( qu < char_counts.size() );
350  ++char_counts[ qu ];
351  }
352  }
353  );
354 
355  // Read the input, sequence by sequence.
356  utils::InputStream input_stream( source );
357  Sequence seq;
358  while( reader.parse_sequence( input_stream, seq ) ) {
359  // Do nothing. All the work is done in the plugin function above.
360  }
361 
362  // Return our guess based on the quality characters that were found in the sequences.
363  return guess_fastq_quality_encoding( char_counts );
364 }
365 
366 // =================================================================================================
367 // Quality Computations
368 // =================================================================================================
369 
370 unsigned char error_probability_to_phred_score( double error_probability )
371 {
372  if( ! std::isfinite( error_probability ) || error_probability < 0.0 || error_probability > 1.0 ) {
373  throw std::invalid_argument(
374  "Cannot convert error probability outside of range [0.0, 1.0] to phred score."
375  );
376  }
377 
378  // Compute the value and put into the valid range for unsigned chars. This might exceed
379  // the encoding that is later used to store the scores in fastq, but this does not concern us
380  // here. Instead, we offer the full range here, and clamp later to the value range when encoding.
381  double v = std::round( -10.0 * std::log10( error_probability ));
382  v = std::min( v, 255.0 );
383  return static_cast<unsigned char>( v );
384 }
385 
386 double phred_score_to_error_probability( unsigned char phred_score )
387 {
388  return std::pow( 10.0, static_cast<double>( phred_score ) / -10.0 );
389 }
390 
391 signed char error_probability_to_solexa_score( double error_probability )
392 {
393  if( ! std::isfinite( error_probability ) || error_probability < 0.0 || error_probability > 1.0 ) {
394  throw std::invalid_argument(
395  "Cannot convert error probability outside of range [0.0, 1.0] to solexa score."
396  );
397  }
398 
399  // The following are the limits than can be encoded in typical fastq-solexa encoding.
400  // We are not using them here, but instead use more relaxed limts, and will apply the actual
401  // limits only when encoding to fastq.
402  // // min that can be encoded in fastq with solexa encoding
403  // if( error_probability < 6.30957344e-7 ) {
404  // return 62;
405  // }
406  // // max that can be encoded in fastq with solexa encoding
407  // if( error_probability > 0.75 ) {
408  // return -5;
409  // }
410 
411  // Compute the score, and check its boundaries. We did a check before, so this is duplication,
412  // but leads to nicer user output.
413  double v = std::round( -10.0 * std::log10( error_probability / ( 1.0 - error_probability )));
414  v = std::min( v, 127.0 );
415  v = std::max( v, -128.0 );
416  return static_cast<signed char>( v );
417 }
418 
419 double solexa_score_to_error_probability( signed char solexa_score )
420 {
421  if( solexa_score < -5 ) {
422  solexa_score = -5;
423  }
424  double const t = std::pow( 10.0, static_cast<double>( solexa_score ) / -10.0 );
425  return t / ( 1.0 + t );
426 }
427 
428 signed char phred_score_to_solexa_score( unsigned char phred_score )
429 {
430  if( phred_score <= 1 ) {
431  return -5;
432  }
433  double v = std::round(
434  10.0 * std::log10( std::pow( 10.0, static_cast<double>( phred_score ) / 10.0 ) - 1.0 )
435  );
436  v = std::min( v, 127.0 );
437  return static_cast<signed char>( v );
438 }
439 
440 unsigned char solexa_score_to_phred_score( signed char solexa_score )
441 {
442  // This cannot overflow, so no need to check.
443  return static_cast<unsigned char>( std::round(
444  10.0 * std::log10( std::pow( 10.0, static_cast<double>( solexa_score ) / 10.0 ) + 1.0 )
445  ));
446 }
447 
448 } // namespace sequence
449 } // namespace genesis
void offset(Histogram &h, double value)
Definition: operations.cpp:47
unsigned char solexa_score_to_phred_score(signed char solexa_score)
Definition: quality.cpp:440
Read Fastq sequence data.
unsigned char quality_decode_to_phred_score(char quality_code, QualityEncoding encoding)
Decode a single quality score char (for example coming from a fastq file) to a phred score...
Definition: quality.cpp:86
void throw_invalid_quality_code_(char quality_code, QualityEncoding encoding)
Local helper function to throw if any invalid fastq quality chars are being used. ...
Definition: quality.cpp:59
unsigned char error_probability_to_phred_score(double error_probability)
Definition: quality.cpp:370
signed char error_probability_to_solexa_score(double error_probability)
Definition: quality.cpp:391
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:67
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::string quality_encoding_name(QualityEncoding encoding)
Return a readable name for each of the encoding types.
Definition: quality.cpp:67
double solexa_score_to_error_probability(signed char solexa_score)
Definition: quality.cpp:419
QualityEncoding guess_fastq_quality_encoding(std::array< size_t, 128 > const &char_counts)
Guess the quality score encoding, based on counts of how often each char appeared in the quality stri...
Definition: quality.cpp:266
double phred_score_to_error_probability(unsigned char phred_score)
Definition: quality.cpp:386
QualityEncoding
List of quality encodings for which we support decoding.
Definition: quality.hpp:71
signed char phred_score_to_solexa_score(unsigned char phred_score)
Definition: quality.cpp:428
Stream interface for reading data from an InputSource, that keeps track of line and column counters...