A library for working with phylogenetic and population genetic data.
v0.32.0
simple_pileup_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
39 
40 #include <cassert>
41 #include <cstdlib>
42 #include <cstring>
43 #include <stdexcept>
44 
45 namespace genesis {
46 namespace population {
47 
48 // =================================================================================================
49 // Reading Records
50 // =================================================================================================
51 
52 std::vector<SimplePileupReader::Record> SimplePileupReader::read_records(
53  std::shared_ptr< utils::BaseInputSource > source
54 ) const {
55  std::vector<SimplePileupReader::Record> result;
56  utils::InputStream it( source );
57 
58  // Reset quality code counts.
59  quality_code_counts_ = std::array<size_t, 128>{};
60 
61  // Read until end of input, pushing copies into the result
62  // (moving would not reduce the number of times that we need to allocate memory here).
63  Record rec;
64  while( parse_line_( it, rec, {}, false )) {
65  result.push_back( rec );
66  }
67  return result;
68 }
69 
70 // std::vector<SimplePileupReader::Record> SimplePileupReader::read_records(
71 // std::shared_ptr< utils::BaseInputSource > source,
72 // std::vector<size_t> const& sample_indices
73 // ) const {
74 // std::vector<SimplePileupReader::Record> result;
75 // utils::InputStream it( source );
76 //
77 // // Convert the list of indices to a bool vec that tells which samples we want to process.
78 // auto const sample_filter = utils::make_bool_vector_from_indices( sample_indices );
79 //
80 // Record rec;
81 // while( parse_line_( it, rec, sample_filter, true )) {
82 // result.push_back( rec );
83 // }
84 // return result;
85 // }
86 
87 std::vector<SimplePileupReader::Record> SimplePileupReader::read_records(
88  std::shared_ptr< utils::BaseInputSource > source,
89  std::vector<bool> const& sample_filter
90 ) const {
91  std::vector<SimplePileupReader::Record> result;
92  utils::InputStream it( source );
93 
94  // Reset quality code counts.
95  quality_code_counts_ = std::array<size_t, 128>{};
96 
97  // Read until end of input, pushing copies into the result
98  // (moving would not reduce the number of times that we need to allocate memory here).
99  Record rec;
100  while( parse_line_( it, rec, sample_filter, true )) {
101  result.push_back( rec );
102  }
103  return result;
104 }
105 
106 // =================================================================================================
107 // Reading Variants
108 // =================================================================================================
109 
111  std::shared_ptr< utils::BaseInputSource > source
112 ) const {
113  std::vector<Variant> result;
114  utils::InputStream it( source );
115 
116  // Read until end of input, pushing copies into the result
117  // (moving would not reduce the number of times that we need to allocate memory here).
118  Variant var;
119  while( parse_line_( it, var, {}, false )) {
120  result.push_back( var );
121  }
122  return result;
123 }
124 
125 // std::vector<Variant> SimplePileupReader::read_variants(
126 // std::shared_ptr< utils::BaseInputSource > source,
127 // std::vector<size_t> const& sample_indices
128 // ) const {
129 // std::vector<Variant> result;
130 // utils::InputStream it( source );
131 //
132 // // Convert the list of indices to a bool vec that tells which samples we want to process.
133 // auto const sample_filter = utils::make_bool_vector_from_indices( sample_indices );
134 //
135 // Variant var;
136 // while( parse_line_( it, var, sample_filter, true )) {
137 // result.push_back( var );
138 // }
139 // return result;
140 // }
141 
143  std::shared_ptr< utils::BaseInputSource > source,
144  std::vector<bool> const& sample_filter
145 ) const {
146  std::vector<Variant> result;
147  utils::InputStream it( source );
148 
149  // Read until end of input, pushing copies into the result
150  // (moving would not reduce the number of times that we need to allocate memory here).
151  Variant var;
152  while( parse_line_( it, var, sample_filter, true )) {
153  result.push_back( var );
154  }
155  return result;
156 }
157 
158 // =================================================================================================
159 // Parsing Records
160 // =================================================================================================
161 
163  utils::InputStream& input_stream,
165 ) const {
166  return parse_line_( input_stream, record, {}, false );
167 }
168 
170  utils::InputStream& input_stream,
172  std::vector<bool> const& sample_filter
173 ) const {
174  return parse_line_( input_stream, record, sample_filter, true );
175 }
176 
177 // =================================================================================================
178 // Parsing Variants
179 // =================================================================================================
180 
182  utils::InputStream& input_stream,
183  Variant& variant
184 ) const {
185  reset_status_( variant );
186  return parse_line_( input_stream, variant, {}, false );
187 }
188 
190  utils::InputStream& input_stream,
191  Variant& variant,
192  std::vector<bool> const& sample_filter
193 ) const {
194  reset_status_( variant );
195  return parse_line_( input_stream, variant, sample_filter, true );
196 }
197 
198 // =================================================================================================
199 // Internal Members
200 // =================================================================================================
201 
202 // -------------------------------------------------------------------------
203 // Reset Status
204 // -------------------------------------------------------------------------
205 
206 void SimplePileupReader::reset_status_( Variant& variant ) const
207 {
208  variant.status.reset();
209  for( auto& sample : variant.samples ) {
210  sample.status.reset();
211  }
212 }
213 
214 // -------------------------------------------------------------------------
215 // Parse Line
216 // -------------------------------------------------------------------------
217 
218 template<class T>
219 bool SimplePileupReader::parse_line_(
220  utils::InputStream& input_stream,
221  T& target,
222  std::vector<bool> const& sample_filter,
223  bool use_sample_filter
224 ) const {
225  // Shorthand.
226  auto& it = input_stream;
227 
228  // If we reached the end of the input stream, reset the target. We do not reset per default,
229  // in order to avoid costly re-initialization of the sample vector. But when we finish with
230  // an input stream, we want to reset, so that subsequent usage of this reader class does not
231  // fail if the pileup file contains a different number of samples.
232  // Still, the user will currently get an error when using the same reader instance to
233  // simultaneously (interlaced) read from multiple pileup files with differing number of samples
234  // into the same target... But who does that?! If you are a user having this issue, please
235  // let me know!
236  if( !it ) {
237  target = T();
238  return false;
239  }
240  assert( it );
241  if( *it == '\n' ) {
242  throw std::runtime_error(
243  "Malformed pileup " + it.source_name() + " at " + it.at() + ": Invalid empty line"
244  );
245  }
246 
247  // Read chromosome.
249  target.chromosome = utils::read_while( it, utils::is_graph );
250  if( target.chromosome.empty() ) {
251  throw std::runtime_error(
252  "Malformed pileup " + it.source_name() + " at " + it.at() +
253  ": empty chromosome name"
254  );
255  }
256  assert( !it || !utils::is_graph( *it ));
257 
258  // Read position.
259  next_field_( it );
260  target.position = utils::parse_unsigned_integer<size_t>( it );
261  if( target.position == 0 ) {
262  throw std::runtime_error(
263  "Malformed pileup " + it.source_name() + " at " + it.at() +
264  ": chromosome position == 0, while pileup demands 1-based positions"
265  );
266  }
267  assert( !it || !utils::is_digit( *it ));
268 
269  // Read reference base. We also set the alternative base, just in case, to make sure that
270  // it has the value that we need it to have in absence of actual data.
271  next_field_( it );
272  auto rb = utils::to_upper( *it );
273  if( ! is_valid_base_or_n( rb )) {
274  if( strict_bases_ ) {
275  throw std::runtime_error(
276  "Malformed pileup " + it.source_name() + " at " + it.at() +
277  ": Invalid reference base that is not in [ACGTN]"
278  );
279  } else {
280  rb = 'N';
281  }
282  }
283  target.reference_base = rb;
284  set_target_alternative_base_( target );
285  ++it;
286 
287  // Read the samples. We switch once for the first line, and thereafter check that we read the
288  // same number of samples each time.
289  size_t src_index = 0;
290  if( target.samples.empty() ) {
291  while( it && *it != '\n' ) {
292  if(
293  ! use_sample_filter ||
294  ( src_index < sample_filter.size() && sample_filter[src_index] )
295  ) {
296  target.samples.emplace_back();
297  process_sample_( it, target, target.samples.back() );
298  } else {
299  skip_sample_( it );
300  }
301  ++src_index;
302  }
303  } else {
304  // Here we need two indices, one over the samples in the file (source),
305  // and one for the samples that we are writing in our Record (destination).
306  size_t dst_index = 0;
307  while( it && *it != '\n' ) {
308  if(
309  ! use_sample_filter ||
310  ( src_index < sample_filter.size() && sample_filter[src_index] )
311  ) {
312  if( dst_index >= target.samples.size() ) {
313  throw std::runtime_error(
314  "Malformed pileup " + it.source_name() + " at " + it.at() +
315  ": Line with different number of samples."
316  );
317  }
318  assert( dst_index < target.samples.size() );
319 
320  process_sample_( it, target, target.samples[dst_index] );
321  ++dst_index;
322  } else {
323  skip_sample_( it );
324  }
325  ++src_index;
326  }
327  if( dst_index != target.samples.size() ) {
328  throw std::runtime_error(
329  "Malformed pileup " + it.source_name() + " at " + it.at() +
330  ": Line with different number of samples."
331  );
332  }
333  }
334  if( use_sample_filter && src_index != sample_filter.size() ) {
335  throw std::runtime_error(
336  "Malformed pileup " + it.source_name() + " at " + it.at() +
337  ": Number of samples in the line does not match the number of filter entries."
338  );
339  }
340 
341  assert( !it || *it == '\n' );
342  ++it;
343  return true;
344 }
345 
346 // -------------------------------------------------------------------------
347 // Process Sample
348 // -------------------------------------------------------------------------
349 
350 template<class T, class S>
351 void SimplePileupReader::process_sample_(
352  utils::InputStream& input_stream,
353  T const& target,
354  S& sample
355 ) const {
356  // Shorthand.
357  auto& it = input_stream;
358 
359  // Reset the sample.
360  sample = S();
361 
362  // Read the total read depth / coverage.
363  next_field_( it );
364  auto const read_depth = utils::parse_unsigned_integer<size_t>( it );
365  set_sample_read_depth_( read_depth, sample );
366  assert( !it || !utils::is_digit( *it ));
367 
368  // Read the nucleotides, skipping everything that we don't want. We need to store these
369  // in a string first, as we want to do quality checks. Bit unfortunate, and maybe there
370  // is a smart way to avoid this for cases without quality string (without code duplication).
371  // Goot enough for now though.
372  // We use two processing methods, a fast one based on the input buffer if possible,
373  // and if that failes (because the end of the bases is beyond the buffer end),
374  // we run a slower version again that does not work on the buffer, but that should be rare.
375  next_field_( it );
376  auto const done_reading_bases = process_sample_read_bases_buffer_( it, target.reference_base );
377  if( ! done_reading_bases ) {
378  // Try again with the slow method.
379  process_sample_read_bases_stream_( it, target.reference_base );
380  }
381  set_sample_read_bases_( base_buffer_, sample );
382 
383  // Read depth count error check. We here allow for the same weird special case of a deletion
384  // that does not count for the depth, as described in convert_to_sample_counts().
385  if(
386  base_buffer_.size() != read_depth &&
387  !( read_depth == 0 && base_buffer_.size() == 1 )
388  ) {
389  throw std::runtime_error(
390  "Malformed pileup " + it.source_name() + " at " + it.at() +
391  ": Given read count (" + std::to_string( read_depth ) +
392  ") does not match the number of bases found in the sample (" +
393  std::to_string( base_buffer_.size() ) + ")."
394  );
395  }
396 
397  // Now read the quality codes, if present.
398  process_quality_string_( it, sample );
399 
400  // Also check if we want to read the ancestral base, if present.
401  process_ancestral_base_( it, sample );
402 
403  // Final file sanity checks.
404  if( it && !( utils::is_blank( *it ) || utils::is_newline( *it ))) {
405  throw std::runtime_error(
406  "Malformed pileup " + it.source_name() + " at " + it.at() +
407  ": Invalid characters."
408  );
409  }
410 }
411 
412 // -------------------------------------------------------------------------
413 // set_target_alternative_base_
414 // -------------------------------------------------------------------------
415 
416 template<>
417 void SimplePileupReader::set_target_alternative_base_<SimplePileupReader::Record>(
419 ) const {
420  // The pileup format does not have an alterantive base, so we do nothing here.
421  (void) target;
422 }
423 
424 template<>
425 void SimplePileupReader::set_target_alternative_base_<Variant>(
426  Variant& target
427 ) const {
428  // The format does not have an ancestral base,
429  // but we want to make sure that it is set to a defined value in the Variant.
430  target.alternative_base = 'N';
431 }
432 
433 // -------------------------------------------------------------------------
434 // set_sample_read_depth_
435 // -------------------------------------------------------------------------
436 
437 template<>
438 void SimplePileupReader::set_sample_read_depth_<SimplePileupReader::Sample>(
439  size_t read_depth,
441 ) const {
442  sample.read_depth = read_depth;
443 }
444 
445 template<>
446 void SimplePileupReader::set_sample_read_depth_<SampleCounts>(
447  size_t read_depth,
448  SampleCounts& sample
449 ) const {
450  // Variant SampleCounts don't use read depth
451  (void) read_depth;
452  (void) sample;
453 }
454 
455 // -------------------------------------------------------------------------
456 // process_sample_read_bases_buffer_
457 // -------------------------------------------------------------------------
458 
459 bool SimplePileupReader::process_sample_read_bases_buffer_(
460  utils::InputStream& input_stream,
461  char reference_base
462 ) const {
463  // Shorthand.
464  auto& it = input_stream;
465  auto const in_buff = input_stream.buffer();
466  base_buffer_.clear();
467 
468  // No need to compute upper and lower case again and again here.
469  auto const u_ref_base = utils::to_upper( reference_base );
470  auto const l_ref_base = utils::to_lower( reference_base );
471 
472  // Go through the bases and store them in the buffer,
473  // keeping track of the position (pos) in the buffer.
474  size_t pos = 0;
475  while( pos < in_buff.second ) {
476 
477  // Stop when we reach the end of the bases.
478  if( ! utils::is_graph( in_buff.first[pos] )) {
479  break;
480  }
481 
482  // Process the char. This is some code duplication with process_sample_read_bases_stream_(),
483  // but the methods process the stream/buffer differently for each of those choices here,
484  // so not much we can do here without too much effort...
485  switch( in_buff.first[pos] ) {
486  case '+':
487  case '-': {
488  // A sequence matching `[+-][0-9]+[ACGTNacgtn]+` is an insertion or deletion.
489  // We skip/ignore those, following the format definition to get the valid chars.
490  // See http://www.htslib.org/doc/samtools-mpileup.html
491  static const std::string allowed_codes = "ACGTN*#";
492 
493  // First, we need to get how many chars there are in this indel.
494  ++pos;
495  if( pos >= in_buff.second ) {
496  // If we reached the end of the buffer here, we do not have enough chars
497  // in the buffer to continue here... use the slow method instead.
498  // This means that we will go to the next buffer block there, and hence will
499  // likely return to this faster method for the next sample again then.
500  return false;
501  }
502  errno = 0;
503  char* endptr;
504  auto const indel_cnt = std::strtoul( &in_buff.first[pos], &endptr, 10 );
505  if( errno == ERANGE || &in_buff.first[pos] == endptr ) {
506  throw std::runtime_error(
507  "Malformed pileup " + it.source_name() + " near " + it.at() +
508  ": Line with invalid indel characters count that is not a valid number."
509  );
510  }
511 
512  // We have read a number. Go to the next char in the buffer after that.
513  pos += endptr - &in_buff.first[pos];
514  if( pos >= in_buff.second ) {
515  // Same as before: Buffer not large enough,
516  // need to do slow method for this sample.
517  return false;
518  }
519 
520  // Now, we skip as many chars as the number we read, making sure that all is in order.
521  for( size_t i = 0; i < indel_cnt; ++i ) {
522  if( pos >= in_buff.second ) {
523  // Same as before: Buffer not large enough,
524  // need to do slow method for this sample.
525  return false;
526  }
527  if(
528  strict_bases_ &&
529  !std::strchr( allowed_codes.c_str(), utils::to_upper( in_buff.first[pos]))
530  ) {
531  throw std::runtime_error(
532  "Malformed pileup " + it.source_name() + " near " + it.at() +
533  ": Line with invalid indel character " +
534  utils::char_to_hex( in_buff.first[pos])
535  );
536  }
537  ++pos;
538  }
539  break;
540  }
541  case '^': {
542  // Caret marks the start of a read segment, followed by a char for the mapping
543  // quality. We skip both of these.
544  ++pos;
545  if( pos >= in_buff.second ) {
546  // Same as above: Buffer not large enough,
547  // need to do slow method for this sample.
548  return false;
549  }
550  ++pos;
551  break;
552  }
553  case '$': {
554  // Dollar marks the end of a read segment. Skip.
555  ++pos;
556  break;
557  }
558  case '.': {
559  // pileup wants '.' to be the ref base in upper case...
560  base_buffer_ += u_ref_base;
561  ++pos;
562  break;
563  }
564  case ',': {
565  // ...and ',' to be the ref base in lower case
566  base_buffer_ += l_ref_base;
567  ++pos;
568  break;
569  }
570  default: {
571  // Everything else we simply add as-is.
572  base_buffer_ += in_buff.first[pos];
573  ++pos;
574  break;
575  }
576  }
577  }
578  assert( pos == in_buff.second || !utils::is_graph( in_buff.first[pos] ) );
579 
580  // Now we have reached the end of the buffer-based approach.
581  // If that worked, that is, if we are not at the end of the buffer, and so have found
582  // the end of the bases, we are good and can move to the end of what we just read.
583  // If not, we return false in order to do a second, slower pass, to catch everything.
584  if( pos < in_buff.second ) {
585  // We stopped the above loop on any non-graph char, such as new lines,
586  // so this jump never goes across one, which is important.
587  it.jump_unchecked( pos );
588  return true;
589  }
590  return false;
591 }
592 
593 // -------------------------------------------------------------------------
594 // process_sample_read_bases_stream_
595 // -------------------------------------------------------------------------
596 
597 void SimplePileupReader::process_sample_read_bases_stream_(
598  utils::InputStream& input_stream,
599  char reference_base
600 ) const {
601  // Shorthand.
602  auto& it = input_stream;
603  base_buffer_.clear();
604 
605  while( it && utils::is_graph( *it )) {
606  auto const c = *it;
607  switch( c ) {
608  case '+':
609  case '-': {
610  // A sequence matching `[+-][0-9]+[ACGTNacgtn]+` is an insertion or deletion.
611  // We skip/ignore those.
612 
613  // We first here allowed degenerated chars, and made a buffer for the codes
614  // that are allowed in indels - like this:
615  // static const std::string allowed_codes = sequence::nucleic_acid_codes_all_letters();
616 
617  // But later, we changed this to use the the proper pileup definition here,
618  // see http://www.htslib.org/doc/samtools-mpileup.html
619  static const std::string allowed_codes = "ACGTN*#";
620 
621  // First, we need to get how many chars there are in this indel.
622  ++it;
623  size_t const indel_cnt = utils::parse_unsigned_integer<size_t>( it );
624 
625  // Then, we skip that many chars, making sure that all is in order.
626  for( size_t i = 0; i < indel_cnt; ++i ) {
627  if( !it ) {
628  throw std::runtime_error(
629  "Malformed pileup " + it.source_name() + " at " + it.at() +
630  ": Line with missing indel characters."
631  );
632  }
633  if(
634  strict_bases_ &&
635  !std::strchr( allowed_codes.c_str(), utils::to_upper( *it ))
636  ) {
637  throw std::runtime_error(
638  "Malformed pileup " + it.source_name() + " at " + it.at() +
639  ": Line with invalid indel character " + utils::char_to_hex( *it )
640  );
641  }
642  ++it;
643  }
644  break;
645  }
646  case '^': {
647  // Caret marks the start of a read segment, followed by a char for the mapping
648  // quality. We skip both of these.
649  ++it;
650  if( !it ) {
651  throw std::runtime_error(
652  "Malformed pileup " + it.source_name() + " at " + it.at() +
653  ": Line with invalid start of read segment marker"
654  );
655  }
656  ++it;
657  break;
658  }
659  case '$': {
660  // Dollar marks the end of a read segment. Skip.
661  ++it;
662  break;
663  }
664  case '.': {
665  // pileup wants '.' to be the ref base in upper case...
666  base_buffer_ += utils::to_upper( reference_base );
667  ++it;
668  break;
669  }
670  case ',': {
671  // ...and ',' to be the ref base in lower case
672  base_buffer_ += utils::to_lower( reference_base );
673  ++it;
674  break;
675  }
676  default: {
677  // Everything else we simply add as-is.
678  base_buffer_ += c;
679  ++it;
680  break;
681  }
682  }
683  }
684  assert( !it || !utils::is_graph( *it ) );
685 }
686 
687 // -------------------------------------------------------------------------
688 // set_sample_read_bases_
689 // -------------------------------------------------------------------------
690 
691 template<>
692 void SimplePileupReader::set_sample_read_bases_<SimplePileupReader::Sample>(
693  std::string const& read_bases,
695 ) const {
696  sample.read_bases = read_bases;
697 }
698 
699 template<>
700 void SimplePileupReader::set_sample_read_bases_<SampleCounts>(
701  std::string const& read_bases,
702  SampleCounts& sample
703 ) const {
704  // Variant SampleCounts don't use read bases
705  (void) read_bases;
706  (void) sample;
707 }
708 
709 // -------------------------------------------------------------------------
710 // process_quality_string_
711 // -------------------------------------------------------------------------
712 
713 template<>
714 void SimplePileupReader::process_quality_string_<SimplePileupReader::Sample>(
715  utils::InputStream& input_stream,
717 ) const {
718  // Shorthand.
719  auto& it = input_stream;
720 
721  // Now read the quality codes, if present.
722  if( with_quality_string_ ) {
723  next_field_( it );
724  sample.phred_scores.reserve( sample.read_depth );
725  while( it && utils::is_graph( *it )) {
726  ++quality_code_counts_[*it];
728  *it, quality_encoding_
729  ));
730  ++it;
731  }
732  assert( !it || !utils::is_graph( *it ) );
733 
734  // Version that processes the whole string at once. Not much time saved, as we need
735  // to allocate the string first. Maybe refine later for speed.
736  // std::string qual;
737  // qual.reserve( sample.read_depth );
738  // while( it && utils::is_graph( *it )) {
739  // qual += *it;
740  // ++it;
741  // }
742  // sample.phred_scores = sequence::quality_decode_to_phred_score( qual, quality_encoding_ );
743 
744  if( sample.read_bases.size() != sample.phred_scores.size() ) {
745  throw std::runtime_error(
746  "Malformed pileup " + it.source_name() + " at " + it.at() +
747  ": Line contains " + std::to_string( sample.read_bases.size() ) + " bases, but " +
748  std::to_string( sample.phred_scores.size() ) + " quality score codes."
749  );
750  }
751  }
752  assert( sample.phred_scores.empty() || sample.read_bases.size() == sample.phred_scores.size() );
753  assert( !it || !utils::is_graph( *it ) );
754 }
755 
756 template<>
757 void SimplePileupReader::process_quality_string_<SampleCounts>(
758  utils::InputStream& input_stream,
759  SampleCounts& sample
760 ) const {
761  // Shorthand.
762  auto& it = input_stream;
763 
764  // Now read the quality codes, if present.
765  if( with_quality_string_ ) {
766  next_field_( it );
767 
768  // We use the internal buffer of the input stream for speed.
769  // If that failes, because we reach the end of it before finishing the field here,
770  // we do a slow pass, but that should be rare.
771  auto const in_buff = it.buffer();
772 
773  // The counts should not have been touched yet. We started with a fresh SampleCounts,
774  // and this function is the only one that calls tally_base_(), so all counts should be 0.
775  assert(
776  sample.a_count == 0 && sample.c_count == 0 && sample.g_count == 0 &&
777  sample.t_count == 0 && sample.n_count == 0 && sample.d_count == 0
778  );
779 
780  // Go through the quality scores, and tally up the bases that have a high enough quality,
781  // keeping track of the position (pos) in the buffer.
782  size_t pos = 0;
783  for( ; pos < in_buff.second; ++pos ) {
784  // Stop when we reach the end of the quality scores.
785  if( ! utils::is_graph( in_buff.first[pos] )) {
786  break;
787  }
788 
789  // Check that we do not read more quality than we have bases.
790  if( pos >= base_buffer_.size() ) {
791  throw std::runtime_error(
792  "Malformed pileup " + it.source_name() + " at " + it.at() +
793  ": Line contains " + std::to_string( base_buffer_.size() ) + " bases, but " +
794  std::to_string( pos ) + " or more quality score codes."
795  );
796  }
797 
798  // Process the score, and tally up its base if the score is high enough.
799  if( min_base_quality_ > 0 ) {
800  auto const score = sequence::quality_decode_to_phred_score(
801  in_buff.first[pos], quality_encoding_
802  );
803  if( score >= min_base_quality_ ) {
804  tally_base_( it, sample, base_buffer_[pos] );
805  }
806  } else {
807  // If the min qual is 0, we do not need to check and convert the score,
808  // as it will pass that threshold anyway.
809  tally_base_( it, sample, base_buffer_[pos] );
810  }
811  }
812  assert( pos == in_buff.second || !utils::is_graph( in_buff.first[pos] ) );
813 
814  // Now we have reached the end of the buffer-based approach.
815  // If that worked, that is, if we are not at the end of the buffer, and so have found
816  // the end of the quality scores, we are good and can move to the end of what we just read.
817  // If not, we reset the counts and do a second, slower pass, to catch everything.
818  if( pos < in_buff.second ) {
819  // We stopped the above loop on any non-graph char, such as new lines,
820  // so this jump never goes across one, which is important.
821  it.jump_unchecked( pos );
822  } else {
823  // Reset
824  pos = 0;
825  sample = SampleCounts();
826 
827  // Go through the quality scores, and tally up the bases that have a high enough quality,
828  // keeping track of the position (pos) in the buffer.
829  while( it && utils::is_graph( *it )) {
830  if( pos >= base_buffer_.size() ) {
831  throw std::runtime_error(
832  "Malformed pileup " + it.source_name() + " at " + it.at() +
833  ": Line contains " + std::to_string( base_buffer_.size() ) + " bases, but " +
834  std::to_string( pos ) + " or more quality score codes."
835  );
836  }
837 
838  // Process the score, and tally up its base if the score is high enough.
839  auto const score = sequence::quality_decode_to_phred_score( *it, quality_encoding_ );
840  if( score >= min_base_quality_ ) {
841  tally_base_( it, sample, base_buffer_[pos] );
842  }
843 
844  ++pos;
845  ++it;
846  }
847  assert( !it || !utils::is_graph( *it ) );
848  }
849 
850  // Last check: Did we reach exactly as many quality codes as we have bases?
851  if( pos != base_buffer_.size() ) {
852  throw std::runtime_error(
853  "Malformed pileup " + it.source_name() + " at " + it.at() +
854  ": Line contains " + std::to_string( base_buffer_.size() ) + " bases, but " +
855  std::to_string( pos ) + " quality score codes."
856  );
857  }
858  } else {
859  // Without quality scores, simply tally up all the bases.
860  // This is the part that could be optimized by not storing the bases in a string first.
861  for( auto const c : base_buffer_ ) {
862  tally_base_( it, sample, c );
863  }
864  }
865  assert( !it || !utils::is_graph( *it ) );
866 }
867 
868 inline void SimplePileupReader::tally_base_(
869  utils::InputStream& input_stream,
870  SampleCounts& sample,
871  char b
872 ) const {
873  switch( b ) {
874  case 'a':
875  case 'A': {
876  ++sample.a_count;
877  break;
878  }
879  case 'c':
880  case 'C': {
881  ++sample.c_count;
882  break;
883  }
884  case 'g':
885  case 'G': {
886  ++sample.g_count;
887  break;
888  }
889  case 't':
890  case 'T': {
891  ++sample.t_count;
892  break;
893  }
894  case 'n':
895  case 'N': {
896  ++sample.n_count;
897  break;
898  }
899  case '*':
900  case '#': {
901  ++sample.d_count;
902  break;
903  }
904  case '<':
905  case '>': {
906  break;
907  }
908  default: {
909  throw std::runtime_error(
910  "Malformed pileup " + input_stream.source_name() + " near " + input_stream.at() +
911  ": Invalid allele character " + utils::char_to_hex( b )
912  );
913  }
914  }
915 }
916 
917 // -------------------------------------------------------------------------
918 // process_ancestral_base_
919 // -------------------------------------------------------------------------
920 
921 template<>
922 void SimplePileupReader::process_ancestral_base_<SimplePileupReader::Sample>(
923  utils::InputStream& input_stream,
925 ) const {
926  // Shorthand.
927  auto& it = input_stream;
928 
929  if( with_ancestral_base_ ) {
930  next_field_( it );
931  // We can simply read in the char here. Even if the iterator is at its end, it will
932  // simply return a null char, which will trigger the subsequent error check.
933  char ab = utils::to_upper( *it );
934  if( !it || is_valid_base_or_n( ab )) {
935  if( strict_bases_ ) {
936  throw std::runtime_error(
937  "Malformed pileup " + it.source_name() + " at " + it.at() +
938  ": Expecting ancestral base character in [ACGTN]."
939  );
940  } else {
941  ab = 'N';
942  }
943  }
944  sample.ancestral_base = ab;
945  ++it;
946  }
947 }
948 
949 template<>
950 void SimplePileupReader::process_ancestral_base_<SampleCounts>(
951  utils::InputStream& input_stream,
952  SampleCounts& sample
953 ) const {
954  (void) input_stream;
955  (void) sample;
956 
957  // Also check if we want to read the ancestral base, if present.
958  if( with_ancestral_base_ ) {
959  // throw std::runtime_error(
960  // "SimplePileupReader currently does not implement to read (m)pileup files "
961  // "with ancestral bases when reading Variants."
962  // );
963 
964  // Let's simply read and ignore the ancestral base, as our Variant/SampleCounts setup
965  // does not store those at the moment.
966  // For simplicity and to avoid code duplication, we just call the other version of this
967  // function with a dummy Sample. This is not super efficient, but given how rare pileups
968  // with ancestral base are, this is totally fine for now. Can optimize later.
969  SimplePileupReader::Sample dummy;
970  process_ancestral_base_( input_stream, dummy );
971  }
972 }
973 
974 // -------------------------------------------------------------------------
975 // skip_sample_
976 // -------------------------------------------------------------------------
977 
978 void SimplePileupReader::skip_sample_(
979  utils::InputStream& input_stream
980 ) const {
981  // Shorthand.
982  auto& it = input_stream;
983 
984  // Read the total read count / coverage.
985  next_field_( it );
987  assert( !it || !utils::is_digit( *it ));
988 
989  // Read the nucleotides.
990  next_field_( it );
992  assert( !it || !utils::is_graph( *it ) );
993 
994  // Read the quality codes, if present.
995  if( with_quality_string_ ) {
996  next_field_( it );
998  }
999  assert( !it || !utils::is_graph( *it ) );
1000 
1001  // Read the ancestral base, if present.
1002  if( with_ancestral_base_ ) {
1003  next_field_( it );
1005  }
1006  assert( !it || !utils::is_graph( *it ) );
1007 
1008  // Final file sanity checks.
1009  if( it && !( utils::is_blank( *it ) || utils::is_newline( *it ))) {
1010  throw std::runtime_error(
1011  "Malformed pileup " + it.source_name() + " at " + it.at() +
1012  ": Invalid characters."
1013  );
1014  }
1015 }
1016 
1017 // -------------------------------------------------------------------------
1018 // next_field_
1019 // -------------------------------------------------------------------------
1020 
1021 void SimplePileupReader::next_field_( utils::InputStream& input_stream ) const
1022 {
1023  // There needs to be at last some whitespace that separates the field. Affirm that,
1024  // then skip it until we are at the content of the next field.
1025  // utils::affirm_char_or_throw( input_stream, utils::is_blank );
1026  // utils::skip_while( input_stream, utils::is_blank );
1027  // assert( !input_stream || !utils::is_blank( *input_stream ));
1028 
1029  // Nope, the above skips empty fields, which can occurr when there are no bases
1030  // at a position at all. Let's just follow the standard more strictly, and check for a tab.
1031  utils::affirm_char_or_throw( input_stream, '\t' );
1032  ++input_stream;
1033 }
1034 
1035 } // namespace population
1036 } // namespace genesis
genesis::sequence::quality_decode_to_phred_score
unsigned char quality_decode_to_phred_score(char quality_code, QualityEncoding encoding)
Decode a single quality score char (for example coming from a fastq file) to a phred score.
Definition: quality.cpp:221
genesis::utils::is_graph
constexpr bool is_graph(char c) noexcept
Return whether a char is a character with graphical representation, according to isgraph of the cctyp...
Definition: char.hpp:166
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
genesis::population::SimplePileupReader::parse_line_variant
bool parse_line_variant(utils::InputStream &input_stream, Variant &variant) const
Read an (m)pileup line, as a Variant.
Definition: simple_pileup_reader.cpp:181
parser.hpp
functions.hpp
genesis::utils::read_while
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
Definition: scanner.hpp:216
helper.hpp
genesis::population::SimplePileupReader::Sample::read_depth
size_t read_depth
Total count of reads covering this position.
Definition: simple_pileup_reader.hpp:117
genesis::population::SimplePileupReader::read_variants
std::vector< Variant > read_variants(std::shared_ptr< utils::BaseInputSource > source) const
Read an (m)pileup file line by line, as Variants.
Definition: simple_pileup_reader.cpp:110
genesis::utils::affirm_char_or_throw
void affirm_char_or_throw(InputStream &source, char criterion, SkipWhitespace skip_ws=SkipWhitespace::kNone)
Lexing function that checks whether the current char from the stream equals the provided one.
Definition: scanner.hpp:385
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
genesis::population::is_valid_base_or_n
constexpr bool is_valid_base_or_n(char c)
Return whether a given base is in ACGTN, case insensitive.
Definition: population/function/functions.hpp:71
genesis::population::SimplePileupReader::Sample
One sample in a pileup line/record.
Definition: simple_pileup_reader.hpp:103
genesis::population::SimplePileupReader::Record
Single line/record from a pileup file.
Definition: simple_pileup_reader.hpp:152
genesis::utils::is_newline
constexpr bool is_newline(char c) noexcept
Return whether a char is either a new line or a carriage return character.
Definition: char.hpp:182
genesis::utils::skip_while
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one.
Definition: scanner.hpp:153
simple_pileup_reader.hpp
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
genesis::population::FilterStatus::reset
void reset()
Definition: filter_status.hpp:117
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::is_digit
constexpr bool is_digit(char c) noexcept
Return whether a char is a digit (0-9), ASCII-only.
Definition: char.hpp:95
genesis::population::SimplePileupReader::Sample::read_bases
std::string read_bases
All bases (expect for indels) of the reads that cover the given position.
Definition: simple_pileup_reader.hpp:126
char.hpp
genesis::utils::is_blank
constexpr bool is_blank(char c) noexcept
Return whether a char is either a space or a tab character.
Definition: char.hpp:174
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
genesis::population::Variant::samples
std::vector< SampleCounts > samples
Definition: variant.hpp:82
genesis::population::SimplePileupReader::parse_line_record
bool parse_line_record(utils::InputStream &input_stream, Record &record) const
Read an (m)pileup line, as a Record.
Definition: simple_pileup_reader.cpp:162
genesis::population::SimplePileupReader::Sample::ancestral_base
char ancestral_base
Base of the ancestral allele.
Definition: simple_pileup_reader.hpp:142
scanner.hpp
genesis::utils::to_lower
constexpr char to_lower(char c) noexcept
Return the lower case version of a letter, ASCII-only.
Definition: char.hpp:221
genesis::population::SimplePileupReader::Sample::phred_scores
std::vector< unsigned char > phred_scores
Phread-scaled scores of the bases as given in read_bases.
Definition: simple_pileup_reader.hpp:135
genesis::population::SimplePileupReader::read_records
std::vector< Record > read_records(std::shared_ptr< utils::BaseInputSource > source) const
Read an (m)pileup file line by line, as pileup Records.
Definition: simple_pileup_reader.cpp:52
codes.hpp
genesis::population::Variant::status
FilterStatus status
Definition: variant.hpp:76