46 namespace population {
53 std::shared_ptr< utils::BaseInputSource > source
55 std::vector<SimplePileupReader::Record> result;
59 quality_code_counts_ = std::array<size_t, 128>{};
64 while( parse_line_( it, rec, {}, false )) {
65 result.push_back( rec );
88 std::shared_ptr< utils::BaseInputSource > source,
89 std::vector<bool>
const& sample_filter
91 std::vector<SimplePileupReader::Record> result;
95 quality_code_counts_ = std::array<size_t, 128>{};
100 while( parse_line_( it, rec, sample_filter,
true )) {
101 result.push_back( rec );
111 std::shared_ptr< utils::BaseInputSource > source
113 std::vector<Variant> result;
119 while( parse_line_( it, var, {}, false )) {
120 result.push_back( var );
143 std::shared_ptr< utils::BaseInputSource > source,
144 std::vector<bool>
const& sample_filter
146 std::vector<Variant> result;
152 while( parse_line_( it, var, sample_filter,
true )) {
153 result.push_back( var );
166 return parse_line_( input_stream, record, {}, false );
172 std::vector<bool>
const& sample_filter
174 return parse_line_( input_stream, record, sample_filter,
true );
185 reset_status_( variant );
186 return parse_line_( input_stream, variant, {}, false );
192 std::vector<bool>
const& sample_filter
194 reset_status_( variant );
195 return parse_line_( input_stream, variant, sample_filter,
true );
206 void SimplePileupReader::reset_status_(
Variant& variant )
const
209 for(
auto& sample : variant.
samples ) {
210 sample.status.reset();
219 bool SimplePileupReader::parse_line_(
220 utils::InputStream& input_stream,
222 std::vector<bool>
const& sample_filter,
223 bool use_sample_filter
226 auto& it = input_stream;
242 throw std::runtime_error(
243 "Malformed pileup " + it.source_name() +
" at " + it.at() +
": Invalid empty line"
250 if( target.chromosome.empty() ) {
251 throw std::runtime_error(
252 "Malformed pileup " + it.source_name() +
" at " + it.at() +
253 ": empty chromosome name"
260 target.position = utils::parse_unsigned_integer<size_t>( it );
261 if( target.position == 0 ) {
262 throw std::runtime_error(
263 "Malformed pileup " + it.source_name() +
" at " + it.at() +
264 ": chromosome position == 0, while pileup demands 1-based positions"
274 if( strict_bases_ ) {
275 throw std::runtime_error(
276 "Malformed pileup " + it.source_name() +
" at " + it.at() +
277 ": Invalid reference base that is not in [ACGTN]"
283 target.reference_base = rb;
284 set_target_alternative_base_( target );
289 size_t src_index = 0;
290 if( target.samples.empty() ) {
291 while( it && *it !=
'\n' ) {
293 ! use_sample_filter ||
294 ( src_index < sample_filter.size() && sample_filter[src_index] )
296 target.samples.emplace_back();
297 process_sample_( it, target, target.samples.back() );
306 size_t dst_index = 0;
307 while( it && *it !=
'\n' ) {
309 ! use_sample_filter ||
310 ( src_index < sample_filter.size() && sample_filter[src_index] )
312 if( dst_index >= target.samples.size() ) {
313 throw std::runtime_error(
314 "Malformed pileup " + it.source_name() +
" at " + it.at() +
315 ": Line with different number of samples."
318 assert( dst_index < target.samples.size() );
320 process_sample_( it, target, target.samples[dst_index] );
327 if( dst_index != target.samples.size() ) {
328 throw std::runtime_error(
329 "Malformed pileup " + it.source_name() +
" at " + it.at() +
330 ": Line with different number of samples."
334 if( use_sample_filter && src_index != sample_filter.size() ) {
335 throw std::runtime_error(
336 "Malformed pileup " + it.source_name() +
" at " + it.at() +
337 ": Number of samples in the line does not match the number of filter entries."
341 assert( !it || *it ==
'\n' );
350 template<
class T,
class S>
351 void SimplePileupReader::process_sample_(
352 utils::InputStream& input_stream,
357 auto& it = input_stream;
364 auto const read_depth = utils::parse_unsigned_integer<size_t>( it );
365 set_sample_read_depth_( read_depth, sample );
376 auto const done_reading_bases = process_sample_read_bases_buffer_( it, target.reference_base );
377 if( ! done_reading_bases ) {
379 process_sample_read_bases_stream_( it, target.reference_base );
381 set_sample_read_bases_( base_buffer_, sample );
386 base_buffer_.size() != read_depth &&
387 !( read_depth == 0 && base_buffer_.size() == 1 )
389 throw std::runtime_error(
390 "Malformed pileup " + it.source_name() +
" at " + it.at() +
392 ") does not match the number of bases found in the sample (" +
398 process_quality_string_( it, sample );
401 process_ancestral_base_( it, sample );
405 throw std::runtime_error(
406 "Malformed pileup " + it.source_name() +
" at " + it.at() +
407 ": Invalid characters."
417 void SimplePileupReader::set_target_alternative_base_<SimplePileupReader::Record>(
425 void SimplePileupReader::set_target_alternative_base_<Variant>(
430 target.alternative_base =
'N';
438 void SimplePileupReader::set_sample_read_depth_<SimplePileupReader::Sample>(
446 void SimplePileupReader::set_sample_read_depth_<SampleCounts>(
459 bool SimplePileupReader::process_sample_read_bases_buffer_(
460 utils::InputStream& input_stream,
464 auto& it = input_stream;
465 auto const in_buff = input_stream.buffer();
466 base_buffer_.clear();
475 while( pos < in_buff.second ) {
485 switch( in_buff.first[pos] ) {
491 static const std::string allowed_codes =
"ACGTN*#";
495 if( pos >= in_buff.second ) {
504 auto const indel_cnt = std::strtoul( &in_buff.first[pos], &endptr, 10 );
505 if( errno == ERANGE || &in_buff.first[pos] == endptr ) {
506 throw std::runtime_error(
507 "Malformed pileup " + it.source_name() +
" near " + it.at() +
508 ": Line with invalid indel characters count that is not a valid number."
513 pos += endptr - &in_buff.first[pos];
514 if( pos >= in_buff.second ) {
521 for(
size_t i = 0; i < indel_cnt; ++i ) {
522 if( pos >= in_buff.second ) {
529 !std::strchr( allowed_codes.c_str(),
utils::to_upper( in_buff.first[pos]))
531 throw std::runtime_error(
532 "Malformed pileup " + it.source_name() +
" near " + it.at() +
533 ": Line with invalid indel character " +
545 if( pos >= in_buff.second ) {
560 base_buffer_ += u_ref_base;
566 base_buffer_ += l_ref_base;
572 base_buffer_ += in_buff.first[pos];
578 assert( pos == in_buff.second || !
utils::is_graph( in_buff.first[pos] ) );
584 if( pos < in_buff.second ) {
587 it.jump_unchecked( pos );
597 void SimplePileupReader::process_sample_read_bases_stream_(
598 utils::InputStream& input_stream,
602 auto& it = input_stream;
603 base_buffer_.clear();
619 static const std::string allowed_codes =
"ACGTN*#";
623 size_t const indel_cnt = utils::parse_unsigned_integer<size_t>( it );
626 for(
size_t i = 0; i < indel_cnt; ++i ) {
628 throw std::runtime_error(
629 "Malformed pileup " + it.source_name() +
" at " + it.at() +
630 ": Line with missing indel characters."
637 throw std::runtime_error(
638 "Malformed pileup " + it.source_name() +
" at " + it.at() +
651 throw std::runtime_error(
652 "Malformed pileup " + it.source_name() +
" at " + it.at() +
653 ": Line with invalid start of read segment marker"
692 void SimplePileupReader::set_sample_read_bases_<SimplePileupReader::Sample>(
693 std::string
const& read_bases,
700 void SimplePileupReader::set_sample_read_bases_<SampleCounts>(
701 std::string
const& read_bases,
714 void SimplePileupReader::process_quality_string_<SimplePileupReader::Sample>(
719 auto& it = input_stream;
722 if( with_quality_string_ ) {
726 ++quality_code_counts_[*it];
728 *it, quality_encoding_
745 throw std::runtime_error(
746 "Malformed pileup " + it.source_name() +
" at " + it.at() +
757 void SimplePileupReader::process_quality_string_<SampleCounts>(
762 auto& it = input_stream;
765 if( with_quality_string_ ) {
771 auto const in_buff = it.buffer();
776 sample.a_count == 0 && sample.c_count == 0 && sample.g_count == 0 &&
777 sample.t_count == 0 && sample.n_count == 0 && sample.d_count == 0
783 for( ; pos < in_buff.second; ++pos ) {
790 if( pos >= base_buffer_.size() ) {
791 throw std::runtime_error(
792 "Malformed pileup " + it.source_name() +
" at " + it.at() +
793 ": Line contains " +
std::to_string( base_buffer_.size() ) +
" bases, but " +
799 if( min_base_quality_ > 0 ) {
801 in_buff.first[pos], quality_encoding_
803 if( score >= min_base_quality_ ) {
804 tally_base_( it, sample, base_buffer_[pos] );
809 tally_base_( it, sample, base_buffer_[pos] );
812 assert( pos == in_buff.second || !
utils::is_graph( in_buff.first[pos] ) );
818 if( pos < in_buff.second ) {
821 it.jump_unchecked( pos );
825 sample = SampleCounts();
830 if( pos >= base_buffer_.size() ) {
831 throw std::runtime_error(
832 "Malformed pileup " + it.source_name() +
" at " + it.at() +
833 ": Line contains " +
std::to_string( base_buffer_.size() ) +
" bases, but " +
840 if( score >= min_base_quality_ ) {
841 tally_base_( it, sample, base_buffer_[pos] );
851 if( pos != base_buffer_.size() ) {
852 throw std::runtime_error(
853 "Malformed pileup " + it.source_name() +
" at " + it.at() +
854 ": Line contains " +
std::to_string( base_buffer_.size() ) +
" bases, but " +
861 for(
auto const c : base_buffer_ ) {
862 tally_base_( it, sample, c );
868 inline void SimplePileupReader::tally_base_(
869 utils::InputStream& input_stream,
870 SampleCounts& sample,
909 throw std::runtime_error(
910 "Malformed pileup " + input_stream.source_name() +
" near " + input_stream.at() +
922 void SimplePileupReader::process_ancestral_base_<SimplePileupReader::Sample>(
927 auto& it = input_stream;
929 if( with_ancestral_base_ ) {
935 if( strict_bases_ ) {
936 throw std::runtime_error(
937 "Malformed pileup " + it.source_name() +
" at " + it.at() +
938 ": Expecting ancestral base character in [ACGTN]."
950 void SimplePileupReader::process_ancestral_base_<SampleCounts>(
958 if( with_ancestral_base_ ) {
969 SimplePileupReader::Sample dummy;
970 process_ancestral_base_( input_stream, dummy );
978 void SimplePileupReader::skip_sample_(
979 utils::InputStream& input_stream
982 auto& it = input_stream;
995 if( with_quality_string_ ) {
1002 if( with_ancestral_base_ ) {
1010 throw std::runtime_error(
1011 "Malformed pileup " + it.source_name() +
" at " + it.at() +
1012 ": Invalid characters."
1021 void SimplePileupReader::next_field_( utils::InputStream& input_stream )
const