A library for working with phylogenetic and population genetic data.
v0.27.0
simple_pileup_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
38 
39 #include <cassert>
40 #include <cstdlib>
41 #include <cstring>
42 #include <stdexcept>
43 
44 namespace genesis {
45 namespace population {
46 
47 // =================================================================================================
48 // Reading Records
49 // =================================================================================================
50 
55  utils::InputStream const& it,
56  std::string& cur_chr, size_t& cur_pos,
57  std::string const& new_chr, size_t new_pos
58 ) {
59  if( new_chr < cur_chr || ( new_chr == cur_chr && new_pos <= cur_pos )) {
60  throw std::runtime_error(
61  "Malformed pileup " + it.source_name() + " at " + it.at() +
62  ": unordered chromosomes and positions"
63  );
64  }
65  cur_chr = new_chr;
66  cur_pos = new_pos;
67 }
68 
69 std::vector<SimplePileupReader::Record> SimplePileupReader::read_records(
70  std::shared_ptr< utils::BaseInputSource > source
71 ) const {
72  std::vector<SimplePileupReader::Record> result;
73  utils::InputStream it( source );
74 
75  // Read, with correct order check, just in case.
76  std::string cur_chr = "";
77  size_t cur_pos = 0;
78  Record rec;
79  while( parse_line_( it, rec, {}, false )) {
80  process_pileup_correct_input_order_check_( it, cur_chr, cur_pos, rec.chromosome, rec.position );
81  result.push_back( rec );
82  }
83  return result;
84 }
85 
86 // std::vector<SimplePileupReader::Record> SimplePileupReader::read_records(
87 // std::shared_ptr< utils::BaseInputSource > source,
88 // std::vector<size_t> const& sample_indices
89 // ) const {
90 // std::vector<SimplePileupReader::Record> result;
91 // utils::InputStream it( source );
92 //
93 // // Convert the list of indices to a bool vec that tells which samples we want to process.
94 // auto const sample_filter = utils::make_bool_vector_from_indices( sample_indices );
95 //
96 // // Read, with correct order check, just in case.
97 // std::string cur_chr = "";
98 // size_t cur_pos = 0;
99 // Record rec;
100 // while( parse_line_( it, rec, sample_filter, true )) {
101 // process_pileup_correct_input_order_check_( it, cur_chr, cur_pos, rec.chromosome, rec.position );
102 // result.push_back( rec );
103 // }
104 // return result;
105 // }
106 
107 std::vector<SimplePileupReader::Record> SimplePileupReader::read_records(
108  std::shared_ptr< utils::BaseInputSource > source,
109  std::vector<bool> const& sample_filter
110 ) const {
111  std::vector<SimplePileupReader::Record> result;
112  utils::InputStream it( source );
113 
114  // Read, with correct order check, just in case.
115  std::string cur_chr = "";
116  size_t cur_pos = 0;
117  Record rec;
118  while( parse_line_( it, rec, sample_filter, true )) {
119  process_pileup_correct_input_order_check_( it, cur_chr, cur_pos, rec.chromosome, rec.position );
120  result.push_back( rec );
121  }
122  return result;
123 }
124 
125 // =================================================================================================
126 // Reading Variants
127 // =================================================================================================
128 
130  std::shared_ptr< utils::BaseInputSource > source
131 ) const {
132  std::vector<Variant> result;
133  utils::InputStream it( source );
134 
135  // Read, with correct order check, just in case.
136  std::string cur_chr = "";
137  size_t cur_pos = 0;
138  Variant var;
139  while( parse_line_( it, var, {}, false )) {
140  process_pileup_correct_input_order_check_( it, cur_chr, cur_pos, var.chromosome, var.position );
141  result.push_back( var );
142  }
143  return result;
144 }
145 
146 // std::vector<Variant> SimplePileupReader::read_variants(
147 // std::shared_ptr< utils::BaseInputSource > source,
148 // std::vector<size_t> const& sample_indices
149 // ) const {
150 // std::vector<Variant> result;
151 // utils::InputStream it( source );
152 //
153 // // Convert the list of indices to a bool vec that tells which samples we want to process.
154 // auto const sample_filter = utils::make_bool_vector_from_indices( sample_indices );
155 //
156 // // Read, with correct order check, just in case.
157 // std::string cur_chr = "";
158 // size_t cur_pos = 0;
159 // Variant var;
160 // while( parse_line_( it, var, sample_filter, true )) {
161 // process_pileup_correct_input_order_check_( it, cur_chr, cur_pos, var.chromosome, var.position );
162 // result.push_back( var );
163 // }
164 // return result;
165 // }
166 
168  std::shared_ptr< utils::BaseInputSource > source,
169  std::vector<bool> const& sample_filter
170 ) const {
171  std::vector<Variant> result;
172  utils::InputStream it( source );
173 
174  // Read, with correct order check, just in case.
175  std::string cur_chr = "";
176  size_t cur_pos = 0;
177  Variant var;
178  while( parse_line_( it, var, sample_filter, true )) {
179  process_pileup_correct_input_order_check_( it, cur_chr, cur_pos, var.chromosome, var.position );
180  result.push_back( var );
181  }
182  return result;
183 }
184 
185 // =================================================================================================
186 // Parsing Records
187 // =================================================================================================
188 
190  utils::InputStream& input_stream,
192 ) const {
193  return parse_line_( input_stream, record, {}, false );
194 }
195 
197  utils::InputStream& input_stream,
199  std::vector<bool> const& sample_filter
200 ) const {
201  return parse_line_( input_stream, record, sample_filter, true );
202 }
203 
204 // =================================================================================================
205 // Parsing Variants
206 // =================================================================================================
207 
209  utils::InputStream& input_stream,
210  Variant& variant
211 ) const {
212  return parse_line_( input_stream, variant, {}, false );
213 }
214 
216  utils::InputStream& input_stream,
217  Variant& variant,
218  std::vector<bool> const& sample_filter
219 ) const {
220  return parse_line_( input_stream, variant, sample_filter, true );
221 }
222 
223 // =================================================================================================
224 // Internal Members
225 // =================================================================================================
226 
227 // -------------------------------------------------------------------------
228 // Parse Line
229 // -------------------------------------------------------------------------
230 
231 template<class T>
232 bool SimplePileupReader::parse_line_(
233  utils::InputStream& input_stream,
234  T& target,
235  std::vector<bool> const& sample_filter,
236  bool use_sample_filter
237 ) const {
238  // Shorthand.
239  auto& it = input_stream;
240 
241  // If we reached the end of the input stream, reset the target. We do not reset per default,
242  // in order to avoid costly re-initialization of the sample vector. But when we finish with
243  // an input stream, we want to reset, so that subsequent usage of this reader class does not
244  // fail if the pileup file contains a different number of samples.
245  // Still, the user will currently get an error when using the same reader instance to
246  // simultaneously (interlaced) read from multiple pileup files with differing number of samples
247  // into the same target... But who does that?! If you are a user having this issue, please
248  // let me know!
249  if( !it ) {
250  target = T();
251  return false;
252  }
253  assert( it );
254  if( *it == '\n' ) {
255  throw std::runtime_error(
256  "Malformed pileup " + it.source_name() + " at " + it.at() + ": Invalid empty line"
257  );
258  }
259 
260  // Read chromosome.
262  target.chromosome = utils::read_while( it, utils::is_graph );
263  if( target.chromosome.empty() ) {
264  throw std::runtime_error(
265  "Malformed pileup " + it.source_name() + " at " + it.at() +
266  ": empty chromosome name"
267  );
268  }
269  assert( !it || !utils::is_graph( *it ));
270 
271  // Read position.
272  next_field_( it );
273  target.position = utils::parse_unsigned_integer<size_t>( it );
274  if( target.position == 0 ) {
275  throw std::runtime_error(
276  "Malformed pileup " + it.source_name() + " at " + it.at() +
277  ": chromosome position == 0"
278  );
279  }
280  assert( !it || !utils::is_digit( *it ));
281 
282  // Read reference base. We also set the alternative base, just in case, to make sure that
283  // it has the value that we need it to have in absence of actual data.
284  next_field_( it );
285  auto rb = utils::to_upper( *it );
286  if( rb != 'A' && rb != 'C' && rb != 'G' && rb != 'T' && rb != 'N' ) {
287  if( strict_bases_ ) {
288  throw std::runtime_error(
289  "Malformed pileup " + it.source_name() + " at " + it.at() +
290  ": Invalid reference base that is not in [ACGTN]"
291  );
292  } else {
293  rb = 'N';
294  }
295  }
296  target.reference_base = rb;
297  set_target_alternative_base_( target );
298  ++it;
299 
300  // Read the samples. We switch once for the first line, and thereafter check that we read the
301  // same number of samples each time.
302  size_t src_index = 0;
303  if( target.samples.empty() ) {
304  while( it && *it != '\n' ) {
305  if(
306  ! use_sample_filter ||
307  ( src_index < sample_filter.size() && sample_filter[src_index] )
308  ) {
309  target.samples.emplace_back();
310  process_sample_( it, target, target.samples.back() );
311  } else {
312  skip_sample_( it );
313  }
314  ++src_index;
315  }
316  } else {
317  // Here we need two indices, one over the samples in the file (source),
318  // and one for the samples that we are writing in our Record (destination).
319  size_t dst_index = 0;
320  while( it && *it != '\n' ) {
321  if(
322  ! use_sample_filter ||
323  ( src_index < sample_filter.size() && sample_filter[src_index] )
324  ) {
325  if( dst_index >= target.samples.size() ) {
326  throw std::runtime_error(
327  "Malformed pileup " + it.source_name() + " at " + it.at() +
328  ": Line with different number of samples."
329  );
330  }
331  assert( dst_index < target.samples.size() );
332 
333  process_sample_( it, target, target.samples[dst_index] );
334  ++dst_index;
335  } else {
336  skip_sample_( it );
337  }
338  ++src_index;
339  }
340  if( dst_index != target.samples.size() ) {
341  throw std::runtime_error(
342  "Malformed pileup " + it.source_name() + " at " + it.at() +
343  ": Line with different number of samples."
344  );
345  }
346  }
347  if( use_sample_filter && src_index != sample_filter.size() ) {
348  throw std::runtime_error(
349  "Malformed pileup " + it.source_name() + " at " + it.at() +
350  ": Number of samples in the line does not match the number of filter entries."
351  );
352  }
353 
354  assert( !it || *it == '\n' );
355  ++it;
356  return true;
357 }
358 
359 // -------------------------------------------------------------------------
360 // Process Sample
361 // -------------------------------------------------------------------------
362 
363 template<class T, class S>
364 void SimplePileupReader::process_sample_(
365  utils::InputStream& input_stream,
366  T const& target,
367  S& sample
368 ) const {
369  // Shorthand.
370  auto& it = input_stream;
371 
372  // Reset the sample.
373  sample = S();
374  base_buffer_.clear();
375 
376  // Read the total read count / coverage.
377  next_field_( it );
378  auto const read_coverage = utils::parse_unsigned_integer<size_t>( it );
379  set_sample_read_coverage_( read_coverage, sample );
380  assert( !it || !utils::is_digit( *it ));
381 
382  // Read the nucleotides, skipping everything that we don't want. We need to store these
383  // in a string first, as we want to do quality checks. Bit unfortunate, and maybe there
384  // is a smart way to avoid this for cases without quality string (without code duplication).
385  // Goot enough for now though.
386  next_field_( it );
387  while( it && utils::is_graph( *it )) {
388  auto const c = *it;
389  switch( c ) {
390  case '+':
391  case '-': {
392  // A sequence matching `[+-][0-9]+[ACGTNacgtn]+` is an insertion or deletion.
393  // We skip/ignore those.
394 
395  // We first here allowed degenerated chars, and made a buffer for the codes
396  // that are allowed in indels - like this:
397  // static const std::string allowed_codes = sequence::nucleic_acid_codes_all_letters();
398 
399  // But later, we changed this to use the the proper pileup definition here,
400  // see http://www.htslib.org/doc/samtools-mpileup.html
401  static const std::string allowed_codes = "ACGTN*#";
402 
403  // First, we need to get how many chars there are in this indel.
404  ++it;
405  size_t const indel_cnt = utils::parse_unsigned_integer<size_t>( it );
406 
407  // Then, we skip that many chars, making sure that all is in order.
408  for( size_t i = 0; i < indel_cnt; ++i ) {
409  if( !it ) {
410  throw std::runtime_error(
411  "Malformed pileup " + it.source_name() + " at " + it.at() +
412  ": Line with missing indel characters."
413  );
414  }
415  if(
416  strict_bases_ &&
417  !std::strchr( allowed_codes.c_str(), utils::to_upper( *it ))
418  ) {
419  throw std::runtime_error(
420  "Malformed pileup " + it.source_name() + " at " + it.at() +
421  ": Line with invalid indel character " + utils::char_to_hex( *it )
422  );
423  }
424  ++it;
425  }
426  break;
427  }
428  case '^': {
429  // Caret marks the start of a read segment, followed by a char for the mapping
430  // quality. We skip both of these.
431  ++it;
432  if( !it ) {
433  throw std::runtime_error(
434  "Malformed pileup " + it.source_name() + " at " + it.at() +
435  ": Line with invalid start of read segment marker"
436  );
437  }
438  ++it;
439  break;
440  }
441  case '$': {
442  // Dollar marks the end of a read segment. Skip.
443  ++it;
444  break;
445  }
446  case '.': {
447  // pileup wants '.' to be the ref base in upper case...
448  base_buffer_ += utils::to_upper( target.reference_base );
449  ++it;
450  break;
451  }
452  case ',': {
453  // ...and ',' to be the ref base in lower case
454  base_buffer_ += utils::to_lower( target.reference_base );
455  ++it;
456  break;
457  }
458  default: {
459  // Everything else we simply add as-is.
460  base_buffer_ += c;
461  ++it;
462  break;
463  }
464  }
465  }
466  assert( !it || !utils::is_graph( *it ) );
467  set_sample_read_bases_( base_buffer_, sample );
468 
469  // Read coverage count error check. We here allow for the same weird special case of a deletion
470  // that does not count for the coverage, as described in convert_to_base_counts().
471  if(
472  base_buffer_.size() != read_coverage &&
473  !( read_coverage == 0 && base_buffer_.size() == 1 )
474  ) {
475  throw std::runtime_error(
476  "Malformed pileup " + it.source_name() + " at " + it.at() +
477  ": Given read count (" + std::to_string( read_coverage ) +
478  ") does not match the number of bases found in the sample (" +
479  std::to_string( base_buffer_.size() ) + ")."
480  );
481  }
482 
483  // Now read the quality codes, if present.
484  process_quality_string_( it, sample );
485 
486  // Also check if we want to read the ancestral base, if present.
487  process_ancestral_base_( it, sample );
488 
489  // Final file sanity checks.
490  if( it && !( utils::is_blank( *it ) || utils::is_newline( *it ))) {
491  throw std::runtime_error(
492  "Malformed pileup " + it.source_name() + " at " + it.at() +
493  ": Invalid characters."
494  );
495  }
496 }
497 
498 // -------------------------------------------------------------------------
499 // Helper for alternative base
500 // -------------------------------------------------------------------------
501 
502 template<>
503 void SimplePileupReader::set_target_alternative_base_<SimplePileupReader::Record>(
505 ) const {
506  // The pileup format does not have an alterantive base, so we do nothing here.
507  (void) target;
508 }
509 
510 template<>
511 void SimplePileupReader::set_target_alternative_base_<Variant>(
512  Variant& target
513 ) const {
514  // The format does not have an ancestral base,
515  // but we want to make sure that it is set to a defined value in the Variant.
516  target.alternative_base = 'N';
517 }
518 
519 // -------------------------------------------------------------------------
520 // Helper for read coverage
521 // -------------------------------------------------------------------------
522 
523 template<>
524 void SimplePileupReader::set_sample_read_coverage_<SimplePileupReader::Sample>(
525  size_t read_coverage,
527 ) const {
528  sample.read_coverage = read_coverage;
529 }
530 
531 template<>
532 void SimplePileupReader::set_sample_read_coverage_<BaseCounts>(
533  size_t read_coverage,
534  BaseCounts& sample
535 ) const {
536  // Variant BaseCounts don't use read coverage
537  (void) read_coverage;
538  (void) sample;
539 }
540 
541 // -------------------------------------------------------------------------
542 // Helper for read bases
543 // -------------------------------------------------------------------------
544 
545 template<>
546 void SimplePileupReader::set_sample_read_bases_<SimplePileupReader::Sample>(
547  std::string const& read_bases,
549 ) const {
550  sample.read_bases = read_bases;
551 }
552 
553 template<>
554 void SimplePileupReader::set_sample_read_bases_<BaseCounts>(
555  std::string const& read_bases,
556  BaseCounts& sample
557 ) const {
558  // Variant BaseCounts don't use read bases
559  (void) read_bases;
560  (void) sample;
561 }
562 
563 // -------------------------------------------------------------------------
564 // Helper for quality strings
565 // -------------------------------------------------------------------------
566 
567 template<>
568 void SimplePileupReader::process_quality_string_<SimplePileupReader::Sample>(
569  utils::InputStream& input_stream,
571 ) const {
572  // Shorthand.
573  auto& it = input_stream;
574 
575  // Now read the quality codes, if present.
576  if( with_quality_string_ ) {
577  next_field_( it );
578  sample.phred_scores.reserve( sample.read_coverage );
579  while( it && utils::is_graph( *it )) {
581  *it, quality_encoding_
582  ));
583  ++it;
584  }
585  assert( !it || !utils::is_graph( *it ) );
586 
587  // Version that processes the whole string at once. Not much time saved, as we need
588  // to allocate the string first. Maybe refine later for speed.
589  // std::string qual;
590  // qual.reserve( sample.read_coverage );
591  // while( it && utils::is_graph( *it )) {
592  // qual += *it;
593  // ++it;
594  // }
595  // sample.phred_scores = sequence::quality_decode_to_phred_score( qual, quality_encoding_ );
596 
597  if( sample.read_bases.size() != sample.phred_scores.size() ) {
598  throw std::runtime_error(
599  "Malformed pileup " + it.source_name() + " at " + it.at() +
600  ": Line contains " + std::to_string( sample.read_bases.size() ) + " bases, but " +
601  std::to_string( sample.phred_scores.size() ) + " quality score codes."
602  );
603  }
604  }
605  assert( sample.phred_scores.empty() || sample.read_bases.size() == sample.phred_scores.size() );
606  assert( !it || !utils::is_graph( *it ) );
607 }
608 
609 template<>
610 void SimplePileupReader::process_quality_string_<BaseCounts>(
611  utils::InputStream& input_stream,
612  BaseCounts& sample
613 ) const {
614  // Shorthand.
615  auto& it = input_stream;
616 
617  // Now read the quality codes, if present.
618  if( with_quality_string_ ) {
619  next_field_( it );
620 
621  // Go through the quality scores, and tally up the bases that have a high enough quality,
622  // keeping track of the position (pos) in the buffer.
623  size_t pos = 0;
624  while( it && utils::is_graph( *it )) {
625  if( pos >= base_buffer_.size() ) {
626  throw std::runtime_error(
627  "Malformed pileup " + it.source_name() + " at " + it.at() +
628  ": Line contains " + std::to_string( base_buffer_.size() ) + " bases, but " +
629  std::to_string( pos ) + " or more quality score codes."
630  );
631  }
632 
633  // Process the score, and tally up its base if the score is high enough.
634  auto const score = sequence::quality_decode_to_phred_score( *it, quality_encoding_ );
635  if( score >= min_base_quality_ ) {
636  tally_base_( it, sample, base_buffer_[pos] );
637  }
638 
639  ++pos;
640  ++it;
641  }
642  assert( !it || !utils::is_graph( *it ) );
643 
644  if( pos != base_buffer_.size() ) {
645  throw std::runtime_error(
646  "Malformed pileup " + it.source_name() + " at " + it.at() +
647  ": Line contains " + std::to_string( base_buffer_.size() ) + " bases, but " +
648  std::to_string( pos ) + " quality score codes."
649  );
650  }
651  } else {
652  // Without quality scores, simply tally up all the bases.
653  // This is the part that could be optimized by not storing the bases in a string first.
654  for( auto const c : base_buffer_ ) {
655  tally_base_( it, sample, c );
656  }
657  }
658  assert( !it || !utils::is_graph( *it ) );
659 }
660 
661 void SimplePileupReader::tally_base_(
662  utils::InputStream& input_stream,
663  BaseCounts& base_count,
664  char b
665 ) const {
666  switch( b ) {
667  case 'a':
668  case 'A': {
669  ++base_count.a_count;
670  break;
671  }
672  case 'c':
673  case 'C': {
674  ++base_count.c_count;
675  break;
676  }
677  case 'g':
678  case 'G': {
679  ++base_count.g_count;
680  break;
681  }
682  case 't':
683  case 'T': {
684  ++base_count.t_count;
685  break;
686  }
687  case 'n':
688  case 'N': {
689  ++base_count.n_count;
690  break;
691  }
692  case '*':
693  case '#': {
694  ++base_count.d_count;
695  break;
696  }
697  case '<':
698  case '>': {
699  break;
700  }
701  default: {
702  throw std::runtime_error(
703  "Malformed pileup " + input_stream.source_name() + " at " + input_stream.at() +
704  ": Invalid allele character " + utils::char_to_hex( b )
705  );
706  }
707  }
708 }
709 
710 // -------------------------------------------------------------------------
711 // Helper for ancestral base
712 // -------------------------------------------------------------------------
713 
714 template<>
715 void SimplePileupReader::process_ancestral_base_<SimplePileupReader::Sample>(
716  utils::InputStream& input_stream,
718 ) const {
719  // Shorthand.
720  auto& it = input_stream;
721 
722  if( with_ancestral_base_ ) {
723  next_field_( it );
724  // We can simply read in the char here. Even if the iterator is at its end, it will
725  // simply return a null char, which will trigger the subsequent error check.
726  char ab = utils::to_upper( *it );
727  if( !it || ( ab != 'A' && ab != 'C' && ab != 'G' && ab != 'T' && ab != 'N' )) {
728  if( strict_bases_ ) {
729  throw std::runtime_error(
730  "Malformed pileup " + it.source_name() + " at " + it.at() +
731  ": Expecting ancestral base character in [ACGTN]."
732  );
733  } else {
734  ab = 'N';
735  }
736  }
737  sample.ancestral_base = ab;
738  ++it;
739  }
740 }
741 
742 template<>
743 void SimplePileupReader::process_ancestral_base_<BaseCounts>(
744  utils::InputStream& input_stream,
745  BaseCounts& sample
746 ) const {
747  (void) input_stream;
748  (void) sample;
749 
750  // Also check if we want to read the ancestral base, if present.
751  if( with_ancestral_base_ ) {
752  // throw std::runtime_error(
753  // "SimplePileupReader currently does not implement to read (m)pileup files "
754  // "with ancestral bases when reading Variants."
755  // );
756 
757  // Let's simply read and ignore the ancestral base, as our Variant/BaseCounts setup
758  // does not store those at the moment.
759  // For simplicity and to avoid code duplication, we just call the other version of this
760  // function with a dummy Sample. This is not super efficient, but given how rare pileups
761  // with ancestral base are, this is totally fine for now. Can optimize later.
762  SimplePileupReader::Sample dummy;
763  process_ancestral_base_( input_stream, dummy );
764  }
765 }
766 
767 // -------------------------------------------------------------------------
768 // Skip Sample
769 // -------------------------------------------------------------------------
770 
771 void SimplePileupReader::skip_sample_(
772  utils::InputStream& input_stream
773 ) const {
774  // Shorthand.
775  auto& it = input_stream;
776 
777  // Read the total read count / coverage.
778  next_field_( it );
780  assert( !it || !utils::is_digit( *it ));
781 
782  // Read the nucleotides.
783  next_field_( it );
785  assert( !it || !utils::is_graph( *it ) );
786 
787  // Read the quality codes, if present.
788  if( with_quality_string_ ) {
789  next_field_( it );
791  }
792  assert( !it || !utils::is_graph( *it ) );
793 
794  // Read the ancestral base, if present.
795  if( with_ancestral_base_ ) {
796  next_field_( it );
798  }
799  assert( !it || !utils::is_graph( *it ) );
800 
801  // Final file sanity checks.
802  if( it && !( utils::is_blank( *it ) || utils::is_newline( *it ))) {
803  throw std::runtime_error(
804  "Malformed pileup " + it.source_name() + " at " + it.at() +
805  ": Invalid characters."
806  );
807  }
808 }
809 
810 // -------------------------------------------------------------------------
811 // Next Field
812 // -------------------------------------------------------------------------
813 
814 void SimplePileupReader::next_field_( utils::InputStream& input_stream ) const
815 {
816  // There needs to be at last some whitespace that separates the field. Affirm that,
817  // then skip it until we are at the content of the next field.
818  // utils::affirm_char_or_throw( input_stream, utils::is_blank );
819  // utils::skip_while( input_stream, utils::is_blank );
820  // assert( !input_stream || !utils::is_blank( *input_stream ));
821 
822  // Nope, the above skips empty fields, which can occurr when there are no bases
823  // at a position at all. Let's just follow the standard more strictly, and check for a tab.
824  utils::affirm_char_or_throw( input_stream, '\t' );
825  ++input_stream;
826 }
827 
828 } // namespace population
829 } // namespace genesis
genesis::utils::InputStream::at
std::string at() const
Return a textual representation of the current input position in the form "line:column".
Definition: input_stream.hpp:481
genesis::sequence::quality_decode_to_phred_score
unsigned char quality_decode_to_phred_score(char quality_code, QualityEncoding encoding)
Decode a single quality score char (for example coming from a fastq file) to a phred score.
Definition: quality.cpp:218
genesis::utils::is_graph
constexpr bool is_graph(char c) noexcept
Return whether a char is a character with graphical representation, according to isgraph of the cctyp...
Definition: char.hpp:166
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:81
genesis::population::SimplePileupReader::parse_line_variant
bool parse_line_variant(utils::InputStream &input_stream, Variant &variant) const
Read an (m)pileup line, as a Variant.
Definition: simple_pileup_reader.cpp:208
parser.hpp
genesis::utils::InputStream::source_name
std::string source_name() const
Get the input source name where this stream reads from.
Definition: input_stream.hpp:522
genesis::utils::read_while
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
Definition: scanner.hpp:216
helper.hpp
genesis::population::SimplePileupReader::Record::position
size_t position
Definition: simple_pileup_reader.hpp:154
genesis::population::Variant::position
size_t position
Definition: variant.hpp:65
genesis::population::SimplePileupReader::read_variants
std::vector< Variant > read_variants(std::shared_ptr< utils::BaseInputSource > source) const
Read an (m)pileup file line by line, as Variants.
Definition: simple_pileup_reader.cpp:129
genesis::population::SimplePileupReader::Sample::read_coverage
size_t read_coverage
Total count of reads covering this position.
Definition: simple_pileup_reader.hpp:116
genesis::utils::affirm_char_or_throw
void affirm_char_or_throw(InputStream &source, char criterion, SkipWhitespace skip_ws=SkipWhitespace::kNone)
Lexing function that checks whether the current char from the stream equals the provided one.
Definition: scanner.hpp:385
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
genesis::population::SimplePileupReader::Sample
One sample in a pileup line/record.
Definition: simple_pileup_reader.hpp:102
genesis::population::process_pileup_correct_input_order_check_
void process_pileup_correct_input_order_check_(utils::InputStream const &it, std::string &cur_chr, size_t &cur_pos, std::string const &new_chr, size_t new_pos)
Local helper function to remove code duplication for the correct input order check.
Definition: simple_pileup_reader.cpp:54
genesis::population::SimplePileupReader::Record
Single line/record from a pileup file.
Definition: simple_pileup_reader.hpp:151
genesis::utils::is_newline
constexpr bool is_newline(char c) noexcept
Return whether a char is either a new line or a carriage return character.
Definition: char.hpp:182
genesis::utils::skip_while
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one.
Definition: scanner.hpp:153
simple_pileup_reader.hpp
genesis::population::Variant
A single variant at a position in a chromosome, along with BaseCounts for a set of samples.
Definition: variant.hpp:62
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::is_digit
constexpr bool is_digit(char c) noexcept
Return whether a char is a digit (0-9), ASCII-only.
Definition: char.hpp:95
genesis::population::SimplePileupReader::Sample::read_bases
std::string read_bases
All bases (expect for indels) of the reads that cover the given position.
Definition: simple_pileup_reader.hpp:125
char.hpp
genesis::population::SimplePileupReader::Record::chromosome
std::string chromosome
Definition: simple_pileup_reader.hpp:153
genesis::utils::is_blank
constexpr bool is_blank(char c) noexcept
Return whether a char is either a space or a tab character.
Definition: char.hpp:174
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
genesis::population::SimplePileupReader::parse_line_record
bool parse_line_record(utils::InputStream &input_stream, Record &record) const
Read an (m)pileup line, as a Record.
Definition: simple_pileup_reader.cpp:189
genesis::population::SimplePileupReader::Sample::ancestral_base
char ancestral_base
Base of the ancestral allele.
Definition: simple_pileup_reader.hpp:141
scanner.hpp
genesis::utils::to_lower
constexpr char to_lower(char c) noexcept
Return the lower case version of a letter, ASCII-only.
Definition: char.hpp:221
genesis::population::SimplePileupReader::Sample::phred_scores
std::vector< unsigned char > phred_scores
Phread-scaled scores of the bases as given in read_bases.
Definition: simple_pileup_reader.hpp:134
genesis::population::SimplePileupReader::read_records
std::vector< Record > read_records(std::shared_ptr< utils::BaseInputSource > source) const
Read an (m)pileup file line by line, as pileup Records.
Definition: simple_pileup_reader.cpp:69
codes.hpp
genesis::population::Variant::chromosome
std::string chromosome
Definition: variant.hpp:64