45 namespace population {
56 std::string& cur_chr,
size_t& cur_pos,
57 std::string
const& new_chr,
size_t new_pos
59 if( new_chr < cur_chr || ( new_chr == cur_chr && new_pos <= cur_pos )) {
60 throw std::runtime_error(
62 ": unordered chromosomes and positions"
70 std::shared_ptr< utils::BaseInputSource > source
72 std::vector<SimplePileupReader::Record> result;
76 std::string cur_chr =
"";
79 while( parse_line_( it, rec, {}, false )) {
81 result.push_back( rec );
108 std::shared_ptr< utils::BaseInputSource > source,
109 std::vector<bool>
const& sample_filter
111 std::vector<SimplePileupReader::Record> result;
115 std::string cur_chr =
"";
118 while( parse_line_( it, rec, sample_filter,
true )) {
120 result.push_back( rec );
130 std::shared_ptr< utils::BaseInputSource > source
132 std::vector<Variant> result;
136 std::string cur_chr =
"";
139 while( parse_line_( it, var, {}, false )) {
141 result.push_back( var );
168 std::shared_ptr< utils::BaseInputSource > source,
169 std::vector<bool>
const& sample_filter
171 std::vector<Variant> result;
175 std::string cur_chr =
"";
178 while( parse_line_( it, var, sample_filter,
true )) {
180 result.push_back( var );
193 return parse_line_( input_stream, record, {}, false );
199 std::vector<bool>
const& sample_filter
201 return parse_line_( input_stream, record, sample_filter,
true );
212 return parse_line_( input_stream, variant, {}, false );
218 std::vector<bool>
const& sample_filter
220 return parse_line_( input_stream, variant, sample_filter,
true );
232 bool SimplePileupReader::parse_line_(
235 std::vector<bool>
const& sample_filter,
236 bool use_sample_filter
239 auto& it = input_stream;
255 throw std::runtime_error(
256 "Malformed pileup " + it.source_name() +
" at " + it.at() +
": Invalid empty line"
263 if( target.chromosome.empty() ) {
264 throw std::runtime_error(
265 "Malformed pileup " + it.source_name() +
" at " + it.at() +
266 ": empty chromosome name"
273 target.position = utils::parse_unsigned_integer<size_t>( it );
274 if( target.position == 0 ) {
275 throw std::runtime_error(
276 "Malformed pileup " + it.source_name() +
" at " + it.at() +
277 ": chromosome position == 0"
286 if( rb !=
'A' && rb !=
'C' && rb !=
'G' && rb !=
'T' && rb !=
'N' ) {
287 if( strict_bases_ ) {
288 throw std::runtime_error(
289 "Malformed pileup " + it.source_name() +
" at " + it.at() +
290 ": Invalid reference base that is not in [ACGTN]"
296 target.reference_base = rb;
297 set_target_alternative_base_( target );
302 size_t src_index = 0;
303 if( target.samples.empty() ) {
304 while( it && *it !=
'\n' ) {
306 ! use_sample_filter ||
307 ( src_index < sample_filter.size() && sample_filter[src_index] )
309 target.samples.emplace_back();
310 process_sample_( it, target, target.samples.back() );
319 size_t dst_index = 0;
320 while( it && *it !=
'\n' ) {
322 ! use_sample_filter ||
323 ( src_index < sample_filter.size() && sample_filter[src_index] )
325 if( dst_index >= target.samples.size() ) {
326 throw std::runtime_error(
327 "Malformed pileup " + it.source_name() +
" at " + it.at() +
328 ": Line with different number of samples."
331 assert( dst_index < target.samples.size() );
333 process_sample_( it, target, target.samples[dst_index] );
340 if( dst_index != target.samples.size() ) {
341 throw std::runtime_error(
342 "Malformed pileup " + it.source_name() +
" at " + it.at() +
343 ": Line with different number of samples."
347 if( use_sample_filter && src_index != sample_filter.size() ) {
348 throw std::runtime_error(
349 "Malformed pileup " + it.source_name() +
" at " + it.at() +
350 ": Number of samples in the line does not match the number of filter entries."
354 assert( !it || *it ==
'\n' );
363 template<
class T,
class S>
364 void SimplePileupReader::process_sample_(
365 utils::InputStream& input_stream,
370 auto& it = input_stream;
374 base_buffer_.clear();
378 auto const read_coverage = utils::parse_unsigned_integer<size_t>( it );
379 set_sample_read_coverage_( read_coverage, sample );
401 static const std::string allowed_codes =
"ACGTN*#";
405 size_t const indel_cnt = utils::parse_unsigned_integer<size_t>( it );
408 for(
size_t i = 0; i < indel_cnt; ++i ) {
410 throw std::runtime_error(
411 "Malformed pileup " + it.source_name() +
" at " + it.at() +
412 ": Line with missing indel characters."
419 throw std::runtime_error(
420 "Malformed pileup " + it.source_name() +
" at " + it.at() +
433 throw std::runtime_error(
434 "Malformed pileup " + it.source_name() +
" at " + it.at() +
435 ": Line with invalid start of read segment marker"
467 set_sample_read_bases_( base_buffer_, sample );
472 base_buffer_.size() != read_coverage &&
473 !( read_coverage == 0 && base_buffer_.size() == 1 )
475 throw std::runtime_error(
476 "Malformed pileup " + it.source_name() +
" at " + it.at() +
478 ") does not match the number of bases found in the sample (" +
484 process_quality_string_( it, sample );
487 process_ancestral_base_( it, sample );
491 throw std::runtime_error(
492 "Malformed pileup " + it.source_name() +
" at " + it.at() +
493 ": Invalid characters."
503 void SimplePileupReader::set_target_alternative_base_<SimplePileupReader::Record>(
511 void SimplePileupReader::set_target_alternative_base_<Variant>(
516 target.alternative_base =
'N';
524 void SimplePileupReader::set_sample_read_coverage_<SimplePileupReader::Sample>(
525 size_t read_coverage,
532 void SimplePileupReader::set_sample_read_coverage_<BaseCounts>(
533 size_t read_coverage,
537 (void) read_coverage;
546 void SimplePileupReader::set_sample_read_bases_<SimplePileupReader::Sample>(
547 std::string
const& read_bases,
554 void SimplePileupReader::set_sample_read_bases_<BaseCounts>(
555 std::string
const& read_bases,
568 void SimplePileupReader::process_quality_string_<SimplePileupReader::Sample>(
573 auto& it = input_stream;
576 if( with_quality_string_ ) {
581 *it, quality_encoding_
598 throw std::runtime_error(
599 "Malformed pileup " + it.source_name() +
" at " + it.at() +
610 void SimplePileupReader::process_quality_string_<BaseCounts>(
615 auto& it = input_stream;
618 if( with_quality_string_ ) {
625 if( pos >= base_buffer_.size() ) {
626 throw std::runtime_error(
627 "Malformed pileup " + it.source_name() +
" at " + it.at() +
628 ": Line contains " +
std::to_string( base_buffer_.size() ) +
" bases, but " +
635 if( score >= min_base_quality_ ) {
636 tally_base_( it, sample, base_buffer_[pos] );
644 if( pos != base_buffer_.size() ) {
645 throw std::runtime_error(
646 "Malformed pileup " + it.source_name() +
" at " + it.at() +
647 ": Line contains " +
std::to_string( base_buffer_.size() ) +
" bases, but " +
654 for(
auto const c : base_buffer_ ) {
655 tally_base_( it, sample, c );
661 void SimplePileupReader::tally_base_(
662 utils::InputStream& input_stream,
663 BaseCounts& base_count,
669 ++base_count.a_count;
674 ++base_count.c_count;
679 ++base_count.g_count;
684 ++base_count.t_count;
689 ++base_count.n_count;
694 ++base_count.d_count;
702 throw std::runtime_error(
703 "Malformed pileup " + input_stream.source_name() +
" at " + input_stream.at() +
715 void SimplePileupReader::process_ancestral_base_<SimplePileupReader::Sample>(
720 auto& it = input_stream;
722 if( with_ancestral_base_ ) {
727 if( !it || ( ab !=
'A' && ab !=
'C' && ab !=
'G' && ab !=
'T' && ab !=
'N' )) {
728 if( strict_bases_ ) {
729 throw std::runtime_error(
730 "Malformed pileup " + it.source_name() +
" at " + it.at() +
731 ": Expecting ancestral base character in [ACGTN]."
743 void SimplePileupReader::process_ancestral_base_<BaseCounts>(
751 if( with_ancestral_base_ ) {
762 SimplePileupReader::Sample dummy;
763 process_ancestral_base_( input_stream, dummy );
771 void SimplePileupReader::skip_sample_(
772 utils::InputStream& input_stream
775 auto& it = input_stream;
788 if( with_quality_string_ ) {
795 if( with_ancestral_base_ ) {
803 throw std::runtime_error(
804 "Malformed pileup " + it.source_name() +
" at " + it.at() +
805 ": Invalid characters."
814 void SimplePileupReader::next_field_( utils::InputStream& input_stream )
const