46 namespace population {
63 iterator_ = parent_->input_.begin();
67 auto const sample_name_count = parent_->input_.data().sample_names.size();
69 check_input_iterator_();
70 num_samples_ = iterator_->samples.size();
73 if( sample_name_count > 0 && iterator_->samples.size() != sample_name_count ) {
74 throw std::runtime_error(
75 "Input source for VariantGaplessInputStream contains " +
76 std::to_string( iterator_->samples.size() ) +
" samples, but its sample " +
77 "name list contains " +
std::to_string( sample_name_count ) +
" names."
82 current_locus_ = GenomeLocus( iterator_->chromosome, 1 );
88 num_samples_ = sample_name_count;
92 auto const chr = find_next_extra_chromosome_();
97 current_locus_ = GenomeLocus( chr, 1 );
102 assert( current_locus_.chromosome !=
"" && current_locus_.position != 0 );
110 if( current_locus_is_covered_by_genome_locus_set_() ) {
111 prepare_current_variant_();
126 void VariantGaplessInputStream::Iterator::advance_()
139 advance_current_locus_();
142 if( current_locus_.empty() ) {
146 assert( current_locus_.chromosome !=
"" && current_locus_.position != 0 );
150 if( current_locus_.position == 1 ) {
153 }
while( ! current_locus_is_covered_by_genome_locus_set_() );
157 prepare_current_variant_();
164 void VariantGaplessInputStream::Iterator::init_chromosome_()
169 assert( !( parent_->ref_genome_ && parent_->seq_dict_ ));
172 assert( current_locus_.chromosome !=
"" );
173 assert( current_locus_.position == 1 );
174 std::string
const& chr = current_locus_.chromosome;
177 if( processed_chromosomes_.count( chr ) > 0 ) {
178 throw std::runtime_error(
179 "In VariantGaplessInputStream: Chromosome \"" + chr +
"\" occurs multiple times. "
180 "Likely, this means that the input is not sorted by chromosome and position."
183 processed_chromosomes_.insert( chr );
186 if( parent_->ref_genome_ ) {
187 ref_genome_it_ = parent_->ref_genome_->find( chr );
188 if( ref_genome_it_ == parent_->ref_genome_->end() ) {
189 throw std::runtime_error(
190 "In VariantGaplessInputStream: Chromosome \"" + chr +
"\" requested "
191 "in the input data, which does not occur in the reference genome."
197 if( parent_->seq_dict_ ) {
198 seq_dict_it_ = parent_->seq_dict_->find( chr );
199 if( seq_dict_it_ == parent_->seq_dict_->end() ) {
200 throw std::runtime_error(
201 "In VariantGaplessInputStream: Chromosome \"" + chr +
"\" requested "
202 "in the input data, which does not occur in the sequence dictionary."
212 if( parent_->genome_locus_set_ ) {
213 genome_locus_set_it_ = parent_->genome_locus_set_->find( chr );
221 void VariantGaplessInputStream::Iterator::advance_current_locus_()
226 advance_current_locus_beyond_input_();
235 if( iterator_->chromosome == current_locus_.chromosome ) {
236 assert( iterator_->position >= current_locus_.position );
237 if( iterator_->position == current_locus_.position ) {
240 advance_current_locus_beyond_input_();
243 check_input_iterator_();
254 iterator_->chromosome == current_locus_.chromosome ||
255 has_more_ref_loci_on_current_chromosome_()
257 ++current_locus_.position;
259 current_locus_ = GenomeLocus( iterator_->chromosome, 1 );
267 void VariantGaplessInputStream::Iterator::advance_current_locus_beyond_input_()
272 assert( !iterator_ );
276 if( has_more_ref_loci_on_current_chromosome_() ) {
277 ++current_locus_.position;
285 if( !parent_->iterate_extra_chromosomes_ ) {
286 current_locus_.clear();
292 auto const next_chr = find_next_extra_chromosome_();
293 if( next_chr.empty() ) {
294 current_locus_.clear();
296 current_locus_.chromosome = next_chr;
297 current_locus_.position = 1;
305 bool VariantGaplessInputStream::Iterator::has_more_ref_loci_on_current_chromosome_()
308 assert( !( parent_->ref_genome_ && parent_->seq_dict_ ));
313 if( parent_->ref_genome_ ) {
314 assert( ref_genome_it_ != parent_->ref_genome_->end() );
315 assert( ref_genome_it_->label() == current_locus_.chromosome );
316 if( current_locus_.position + 1 <= ref_genome_it_->size() ) {
320 if( parent_->seq_dict_ ) {
321 assert( seq_dict_it_ != parent_->seq_dict_->end() );
322 assert( seq_dict_it_->name == current_locus_.chromosome );
323 if( current_locus_.position + 1 <= seq_dict_it_->length ) {
334 std::string VariantGaplessInputStream::Iterator::find_next_extra_chromosome_()
339 if( !parent_->iterate_extra_chromosomes_ ) {
344 if( parent_->ref_genome_ ) {
351 if( processed_chromosomes_.size() == parent_->ref_genome_->size() ) {
356 for(
auto const& ref_chr : *parent_->ref_genome_ ) {
357 if( ref_chr.label().empty() ) {
358 throw std::runtime_error(
359 "Invalid empty chromosome name in reference genome."
362 if( processed_chromosomes_.count( ref_chr.label() ) == 0 ) {
363 return ref_chr.label();
373 if( parent_->seq_dict_ ) {
375 if( processed_chromosomes_.size() == parent_->seq_dict_->size() ) {
380 for(
auto const& entry : *parent_->seq_dict_ ) {
381 if( entry.name.empty() ) {
382 throw std::runtime_error(
383 "Invalid empty chromosome name in sequence dictionary."
386 if( processed_chromosomes_.count( entry.name ) == 0 ) {
404 void VariantGaplessInputStream::Iterator::prepare_current_variant_()
408 assert( current_locus_.chromosome !=
"" && current_locus_.position != 0 );
411 if( parent_->ref_genome_ ) {
412 assert( ref_genome_it_ != parent_->ref_genome_->end() );
413 assert( ref_genome_it_->label() == current_locus_.chromosome );
416 if( current_locus_.position > ref_genome_it_->length() ) {
417 throw std::runtime_error(
418 "In VariantGaplessInputStream: Invalid input data, which has data "
419 "beyond the reference genome at " +
to_string( current_locus_ )
423 if( parent_->seq_dict_ ) {
424 assert( seq_dict_it_ != parent_->seq_dict_->end() );
425 assert( seq_dict_it_->name == current_locus_.chromosome );
426 if( current_locus_.position > seq_dict_it_->length ) {
427 throw std::runtime_error(
428 "In VariantGaplessInputStream: Invalid input data, which has data "
429 "beyond the reference genome at " +
to_string( current_locus_ )
436 if( iterator_ &&
locus_equal( iterator_->chromosome, iterator_->position, current_locus_ )) {
437 current_variant_is_missing_ =
false;
440 if( iterator_->samples.size() != num_samples_ ) {
441 throw std::runtime_error(
442 "In VariantGaplessInputStream: Invalid input data that has an inconsistent "
443 "number of samples throughout, first occurring at " +
to_string( current_locus_ ) +
445 " samples based on first iteration, but found " +
450 current_variant_is_missing_ =
true;
451 missing_variant_.chromosome = current_locus_.chromosome;
452 missing_variant_.position = current_locus_.position;
453 missing_variant_.status.reset( VariantFilterTag::kMissing );
454 missing_variant_.reference_base =
'N';
455 missing_variant_.alternative_base =
'N';
459 missing_variant_.samples.resize( num_samples_ );
460 for(
auto& sample : missing_variant_.samples ) {
461 sample = SampleCounts();
462 sample.status.reset( SampleCountsFilterTag::kMissing );
466 prepare_current_variant_ref_base_();
473 void VariantGaplessInputStream::Iterator::prepare_current_variant_ref_base_()
476 auto& cur_var = current_variant_();
481 assert( !current_locus_.chromosome.empty() && current_locus_.position > 0 );
483 cur_var.chromosome, cur_var.position, current_locus_
488 if( !parent_->ref_genome_ ) {
491 assert( ref_genome_it_ != parent_->ref_genome_->end() );
492 assert( ref_genome_it_->label() == current_locus_.chromosome );
500 current_locus_.position > 0 &&
501 current_locus_.position <= ref_genome_it_->length()
503 auto const ref_base =
utils::to_upper( ref_genome_it_->site_at( current_locus_.position - 1 ));
515 throw std::runtime_error(
516 "At chromosome \"" + current_locus_.chromosome +
"\" position " +
517 std::to_string( current_locus_.position ) +
", the reference genome has base '" +
518 std::string( 1, ref_base ) +
"', which is not a valid nucleic acid code"
522 throw std::runtime_error(
523 "At chromosome \"" + current_locus_.chromosome +
"\" position " +
525 ", the reference base in the data is '" +
526 std::string( 1, cur_var.reference_base ) +
"'. " +
527 "However, the reference genome has base '" + std::string( 1, ref_base ) +
528 "', which does not code for that base, and hence likely points to "
529 "some kind of mismatch"
541 void VariantGaplessInputStream::Iterator::check_input_iterator_()
543 if( iterator_->chromosome.empty() || iterator_->position == 0 ) {
544 throw std::runtime_error(
545 "In VariantGaplessInputStream: Invalid position "
546 "with empty chromosome name or zero position."
555 bool VariantGaplessInputStream::Iterator::current_locus_is_covered_by_genome_locus_set_()
558 assert( current_locus_.chromosome !=
"" );
559 assert( current_locus_.position > 0 );
562 if( ! parent_->genome_locus_set_ ) {
568 if( genome_locus_set_it_ == parent_->genome_locus_set_->end() ) {
574 assert( genome_locus_set_it_->first == current_locus_.chromosome );
575 return parent_->genome_locus_set_->is_covered( genome_locus_set_it_, current_locus_.position );