A library for working with phylogenetic and population genetic data.
v0.32.0
variant_gapless_input_stream.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
38 
39 #include <algorithm>
40 #include <cassert>
41 #include <iterator>
42 #include <numeric>
43 #include <stdexcept>
44 
45 namespace genesis {
46 namespace population {
47 
48 // =================================================================================================
49 // Iterator Constructors and Rule of Five
50 // =================================================================================================
51 
52 VariantGaplessInputStream::Iterator::Iterator( VariantGaplessInputStream* parent )
53  : parent_(parent)
54 {
55  // We use the parent as a check if this Iterator is intended to be a begin() or end()
56  // iterator. We here check this to avoid accidentally starting an iteration without data.
57  if( ! parent_ ) {
58  return;
59  }
60 
61  // Start the iteration, which already makes the first Variant ready in the input.
62  // We don't need to store the end, as the iterator itself is able to tell us that.
63  iterator_ = parent_->input_.begin();
64 
65  // We get the number of samples in the Variant to initialize the dummy Variant
66  // for missing positions where this iterator does not have data.
67  auto const sample_name_count = parent_->input_.data().sample_names.size();
68  if( iterator_ ) {
69  check_input_iterator_();
70  num_samples_ = iterator_->samples.size();
71 
72  // We assume that the sample_names are of the correct size, if given.
73  if( sample_name_count > 0 && iterator_->samples.size() != sample_name_count ) {
74  throw std::runtime_error(
75  "Input source for VariantGaplessInputStream contains " +
76  std::to_string( iterator_->samples.size() ) + " samples, but its sample " +
77  "name list contains " + std::to_string( sample_name_count ) + " names."
78  );
79  }
80 
81  // Now we want to start the iteration on the first chromosome where the iterator starts.
82  current_locus_ = GenomeLocus( iterator_->chromosome, 1 );
83  } else {
84  // If we have no data in the input at all (for instance, because of some aggressive
85  // filter settings), we use the sample names as an indicator for the number of dummy
86  // samples to create. This might still be needed when we want to iterator genome
87  // positions from the ref genome or sequence dict.
88  num_samples_ = sample_name_count;
89 
90  // We have no actual input data. Still let's see if there are extra chromosomes we want.
91  // We might not have anything, in which case we are done already.
92  auto const chr = find_next_extra_chromosome_();
93  if( chr.empty() ) {
94  parent_ = nullptr;
95  return;
96  }
97  current_locus_ = GenomeLocus( chr, 1 );
98  }
99 
100  // If we are here, we have initialized the current locus to the first position on some valid
101  // chromosome, and we can start the processing.
102  assert( current_locus_.chromosome != "" && current_locus_.position != 0 );
103  init_chromosome_();
104 
105  // We have just initialized the chrosome, including all cache pointer for the given references.
106  // We now use that to check that the position where we started is actually covered by the
107  // genome locus set filter. If not, we move on until we find a covered position.
108  // This is inefficient, as we are looping over gaps and data that might be filtered out
109  // immediately again. But it is good enough for now, and the use case is pretty special anyway.
110  if( current_locus_is_covered_by_genome_locus_set_() ) {
111  prepare_current_variant_();
112  } else {
113  // The advance function will loop until it finds a covered locus.
114  advance_();
115  }
116 }
117 
118 // =================================================================================================
119 // Iterator Internal Members
120 // =================================================================================================
121 
122 // -------------------------------------------------------------------------
123 // advance_
124 // -------------------------------------------------------------------------
125 
126 void VariantGaplessInputStream::Iterator::advance_()
127 {
128  // Some basic checks.
129  assert( parent_ );
130 
131  // We need to loop until we find a locus that is actually covered by the genome locus set.
132  // If that is not given anyway, this loop will immediately exit. But in case that we have
133  // a genome locus set filter, for simplicty, we have implemented this as a loop here.
134  // At least, we only do the work of preparing the variant at the very end, once we know
135  // that we want to visit it.
136  do {
137  // Move the current_locus_, and potentially the input iterator,
138  // to the next position we want to process.
139  advance_current_locus_();
140 
141  // If there is no next position, we are done.
142  if( current_locus_.empty() ) {
143  parent_ = nullptr;
144  return;
145  }
146  assert( current_locus_.chromosome != "" && current_locus_.position != 0 );
147 
148  // If the next position is the start of a chromosome,
149  // we need to set it up correctly first.
150  if( current_locus_.position == 1 ) {
151  init_chromosome_();
152  }
153  } while( ! current_locus_is_covered_by_genome_locus_set_() );
154 
155 
156  // Now we have everything to populate our variant as needed.
157  prepare_current_variant_();
158 }
159 
160 // -------------------------------------------------------------------------
161 // init_chromosome_
162 // -------------------------------------------------------------------------
163 
164 void VariantGaplessInputStream::Iterator::init_chromosome_()
165 {
166  // Check that we are not done yet (parent still valid), and that we have either
167  // a ref genome or a seq dict, but not both (neither is also fine).
168  assert( parent_ );
169  assert( !( parent_->ref_genome_ && parent_->seq_dict_ ));
170 
171  // Check that we are indeed at the beginning of a new chromosome.
172  assert( current_locus_.chromosome != "" );
173  assert( current_locus_.position == 1 );
174  std::string const& chr = current_locus_.chromosome;
175 
176  // Check that we do not accidentally duplicate any chromosomes.
177  if( processed_chromosomes_.count( chr ) > 0 ) {
178  throw std::runtime_error(
179  "In VariantGaplessInputStream: Chromosome \"" + chr + "\" occurs multiple times. "
180  "Likely, this means that the input is not sorted by chromosome and position."
181  );
182  }
183  processed_chromosomes_.insert( chr );
184 
185  // If we have a reference genome, set the cache value for fast finding of the sequence.
186  if( parent_->ref_genome_ ) {
187  ref_genome_it_ = parent_->ref_genome_->find( chr );
188  if( ref_genome_it_ == parent_->ref_genome_->end() ) {
189  throw std::runtime_error(
190  "In VariantGaplessInputStream: Chromosome \"" + chr + "\" requested "
191  "in the input data, which does not occur in the reference genome."
192  );
193  }
194  }
195 
196  // Same for sequence dict.
197  if( parent_->seq_dict_ ) {
198  seq_dict_it_ = parent_->seq_dict_->find( chr );
199  if( seq_dict_it_ == parent_->seq_dict_->end() ) {
200  throw std::runtime_error(
201  "In VariantGaplessInputStream: Chromosome \"" + chr + "\" requested "
202  "in the input data, which does not occur in the sequence dictionary."
203  );
204  }
205  }
206 
207  // Lastly, for the genome locus set, we do the same check, but slightly different:
208  // The set might not contain chromosomes that are filtered out completely anyway,
209  // and in that case we do not want to fail, but just indicate that be setting the
210  // cache iterator to the end. That is done by the find function anyway,
211  // so no further steps or checks are needed in that case.
212  if( parent_->genome_locus_set_ ) {
213  genome_locus_set_it_ = parent_->genome_locus_set_->find( chr );
214  }
215 }
216 
217 // -------------------------------------------------------------------------
218 // advance_current_locus_
219 // -------------------------------------------------------------------------
220 
221 void VariantGaplessInputStream::Iterator::advance_current_locus_()
222 {
223  // If we have no more input data, we are processing positions (and potential extra chromosomes)
224  // as provided by the ref genome or seq dict.
225  if( !iterator_ ) {
226  advance_current_locus_beyond_input_();
227  return;
228  }
229 
230  // If the input data is at exactly where we are in our iteration (i.e., there was data
231  // for the current position), we need to advance the iterator. That could lead to its end,
232  // in which case we do the same as above: See if there are positions beyond.
233  // If this is not the case, the iterator is somewhere ahead of us here, and so we just leave
234  // it there until we reach its position (in which case the condition here will then trigger).
235  if( iterator_->chromosome == current_locus_.chromosome ) {
236  assert( iterator_->position >= current_locus_.position );
237  if( iterator_->position == current_locus_.position ) {
238  ++iterator_;
239  if( !iterator_ ) {
240  advance_current_locus_beyond_input_();
241  return;
242  }
243  check_input_iterator_();
244  }
245  }
246  assert( iterator_ );
247 
248  // If the iterator still has data on the chromosome, or the ref genome or seq dict has,
249  // we just move a position forward. We here do not care if the iterator actually has
250  // data for that next position; this is checked when populating the data. All we need
251  // to know here is that there will be some more data at some point on this chromosome.
252  // If not, we start a new chromosome.
253  if(
254  iterator_->chromosome == current_locus_.chromosome ||
255  has_more_ref_loci_on_current_chromosome_()
256  ) {
257  ++current_locus_.position;
258  } else {
259  current_locus_ = GenomeLocus( iterator_->chromosome, 1 );
260  }
261 }
262 
263 // -------------------------------------------------------------------------
264 // advance_current_locus_beyond_input_
265 // -------------------------------------------------------------------------
266 
267 void VariantGaplessInputStream::Iterator::advance_current_locus_beyond_input_()
268 {
269  // Assumptions about the caller. We only get called when there is no more data in the iterator,
270  // but we are not yet fully done with the ref genome or seq dict extra chromosomes.
271  assert( parent_ );
272  assert( !iterator_ );
273 
274  // We first check if the next incremental position is still valid according to the
275  // ref genome or seq dict. If so, we just move to it and are done.
276  if( has_more_ref_loci_on_current_chromosome_() ) {
277  ++current_locus_.position;
278  return;
279  }
280 
281  // Once we are here, we have processed a chromosome and might want to move to the next.
282  // As we are already beyond the input data when this function is called, this can only
283  // mean that we want to check for extra chromosomes that are only in the ref genome or
284  // seq dict, but not in the input data. Check if we want to do that at all.
285  if( !parent_->iterate_extra_chromosomes_ ) {
286  current_locus_.clear();
287  return;
288  }
289 
290  // If not, we reached the end of one extra chr and want to move to the next,
291  // or if there isn't any, indicate to the caller that we are done.
292  auto const next_chr = find_next_extra_chromosome_();
293  if( next_chr.empty() ) {
294  current_locus_.clear();
295  } else {
296  current_locus_.chromosome = next_chr;
297  current_locus_.position = 1;
298  }
299 }
300 
301 // -------------------------------------------------------------------------
302 // has_more_ref_loci_on_current_chromosome_
303 // -------------------------------------------------------------------------
304 
305 bool VariantGaplessInputStream::Iterator::has_more_ref_loci_on_current_chromosome_()
306 {
307  assert( parent_ );
308  assert( !( parent_->ref_genome_ && parent_->seq_dict_ ));
309 
310  // Check if there is a next position on the chromosome that we are currently at.
311  // Positions are 1-based, so we need smaller or equal comparison here.
312  // If neither ref genome nor seq dict are given, we just return false.
313  if( parent_->ref_genome_ ) {
314  assert( ref_genome_it_ != parent_->ref_genome_->end() );
315  assert( ref_genome_it_->label() == current_locus_.chromosome );
316  if( current_locus_.position + 1 <= ref_genome_it_->size() ) {
317  return true;
318  }
319  }
320  if( parent_->seq_dict_ ) {
321  assert( seq_dict_it_ != parent_->seq_dict_->end() );
322  assert( seq_dict_it_->name == current_locus_.chromosome );
323  if( current_locus_.position + 1 <= seq_dict_it_->length ) {
324  return true;
325  }
326  }
327  return false;
328 }
329 
330 // -------------------------------------------------------------------------
331 // find_next_extra_chromosome_
332 // -------------------------------------------------------------------------
333 
334 std::string VariantGaplessInputStream::Iterator::find_next_extra_chromosome_()
335 {
336  assert( parent_ );
337 
338  // We might not want to do extra chromosomes at all.
339  if( !parent_->iterate_extra_chromosomes_ ) {
340  return "";
341  }
342 
343  // Check for extra ref genome chromosomes.
344  if( parent_->ref_genome_ ) {
345  // Check if there are any more chromosomes that we have not done yet.
346  // During the nornal iteration with data, we always check that a chromosome that is found
347  // in the data also is in the ref genome (or seq dict, same there below). So, when we reach
348  // the end of the data, and then process the extra chromosomes here, we can be sure that
349  // there are no chromosomes in the data that are not also in the ref genome - hence, the
350  // size check here works to test completness of the chromosomes.
351  if( processed_chromosomes_.size() == parent_->ref_genome_->size() ) {
352  return "";
353  }
354 
355  // If yes, find the next one.
356  for( auto const& ref_chr : *parent_->ref_genome_ ) {
357  if( ref_chr.label().empty() ) {
358  throw std::runtime_error(
359  "Invalid empty chromosome name in reference genome."
360  );
361  }
362  if( processed_chromosomes_.count( ref_chr.label() ) == 0 ) {
363  return ref_chr.label();
364  }
365  }
366 
367  // We are guaranteed to find one, so this here cannot be reached.
368  assert( false );
369  }
370 
371  // Same for extra seq dict chromosomes.
372  // Unfortunate code duplication due to the slightly different interfaces.
373  if( parent_->seq_dict_ ) {
374  // Check if there are any more chromosomes that we have not done yet.
375  if( processed_chromosomes_.size() == parent_->seq_dict_->size() ) {
376  return "";
377  }
378 
379  // If yes, find the next one.
380  for( auto const& entry : *parent_->seq_dict_ ) {
381  if( entry.name.empty() ) {
382  throw std::runtime_error(
383  "Invalid empty chromosome name in sequence dictionary."
384  );
385  }
386  if( processed_chromosomes_.count( entry.name ) == 0 ) {
387  return entry.name;
388  }
389  }
390 
391  // We are guaranteed to find one, so this here cannot be reached.
392  assert( false );
393  }
394 
395  // If neither is given, just return an empty string to indiate that we do not have
396  // any extra chromosomes to process.
397  return "";
398 }
399 
400 // -------------------------------------------------------------------------
401 // prepare_current_variant_
402 // -------------------------------------------------------------------------
403 
404 void VariantGaplessInputStream::Iterator::prepare_current_variant_()
405 {
406  // We expect to be at a valid current locus.
407  assert( parent_ );
408  assert( current_locus_.chromosome != "" && current_locus_.position != 0 );
409 
410  // Check that the current locus is valid according to the ref genome or seq dict.
411  if( parent_->ref_genome_ ) {
412  assert( ref_genome_it_ != parent_->ref_genome_->end() );
413  assert( ref_genome_it_->label() == current_locus_.chromosome );
414 
415  // We use 1-based positions here, hence the greater-than comparison.
416  if( current_locus_.position > ref_genome_it_->length() ) {
417  throw std::runtime_error(
418  "In VariantGaplessInputStream: Invalid input data, which has data "
419  "beyond the reference genome at " + to_string( current_locus_ )
420  );
421  }
422  }
423  if( parent_->seq_dict_ ) {
424  assert( seq_dict_it_ != parent_->seq_dict_->end() );
425  assert( seq_dict_it_->name == current_locus_.chromosome );
426  if( current_locus_.position > seq_dict_it_->length ) {
427  throw std::runtime_error(
428  "In VariantGaplessInputStream: Invalid input data, which has data "
429  "beyond the reference genome at " + to_string( current_locus_ )
430  );
431  }
432  }
433 
434  // Check if the current locus has data. If so, we point our data to the input data.
435  // If not, we point to our internal "missing" variant dummy, and reset it from prev iterations.
436  if( iterator_ && locus_equal( iterator_->chromosome, iterator_->position, current_locus_ )) {
437  current_variant_is_missing_ = false;
438 
439  // Error check for consistent sample size.
440  if( iterator_->samples.size() != num_samples_ ) {
441  throw std::runtime_error(
442  "In VariantGaplessInputStream: Invalid input data that has an inconsistent "
443  "number of samples throughout, first occurring at " + to_string( current_locus_ ) +
444  ". Expected " + std::to_string( num_samples_ ) +
445  " samples based on first iteration, but found " +
446  std::to_string( iterator_->samples.size() ) + " samples instead."
447  );
448  }
449  } else {
450  current_variant_is_missing_ = true;
451  missing_variant_.chromosome = current_locus_.chromosome;
452  missing_variant_.position = current_locus_.position;
453  missing_variant_.status.reset( VariantFilterTag::kMissing );
454  missing_variant_.reference_base = 'N';
455  missing_variant_.alternative_base = 'N';
456 
457  // In case that the Variant is moved-from, we need to reset the sample size. In case it was
458  // modified (by some filter or transformation), we also need to reset the counts.
459  missing_variant_.samples.resize( num_samples_ );
460  for( auto& sample : missing_variant_.samples ) {
461  sample = SampleCounts();
462  sample.status.reset( SampleCountsFilterTag::kMissing );
463  }
464  }
465 
466  prepare_current_variant_ref_base_();
467 }
468 
469 // -------------------------------------------------------------------------
470 // prepare_current_variant_ref_base_
471 // -------------------------------------------------------------------------
472 
473 void VariantGaplessInputStream::Iterator::prepare_current_variant_ref_base_()
474 {
475  // Shorthand. Points to either the missing variant or the input iterator variant.
476  auto& cur_var = current_variant_();
477 
478  // This function expects current_variant to be set up for the locus already,
479  // and only be missing the ref base setup.
480  assert( parent_ );
481  assert( !current_locus_.chromosome.empty() && current_locus_.position > 0 );
482  assert( locus_equal(
483  cur_var.chromosome, cur_var.position, current_locus_
484  ));
485 
486  // If we have a ref genome, we use it to get or check the reference base.
487  // If not, we are done.
488  if( !parent_->ref_genome_ ) {
489  return;
490  }
491  assert( ref_genome_it_ != parent_->ref_genome_->end() );
492  assert( ref_genome_it_->label() == current_locus_.chromosome );
493 
494  // Get the reference base and check it against the Variant.
495  // This throws an exception for mismatches; sets the ref and alt base if not
496  // already done so; does nothing if already set in the Variant.
497  // We use 1-based positions, but the ref genome is a simple sequence in string
498  // format, so we need to offset by one here.
499  assert(
500  current_locus_.position > 0 &&
501  current_locus_.position <= ref_genome_it_->length()
502  );
503  auto const ref_base = utils::to_upper( ref_genome_it_->site_at( current_locus_.position - 1 ));
504  // guess_and_set_ref_and_alt_bases( cur_var, ref_base, true );
505  if( is_valid_base( utils::to_upper( cur_var.reference_base ))) {
506  bool contains = false;
507  try {
510  ref_base, utils::to_upper( cur_var.reference_base )
511  );
512  } catch(...) {
513  // The above throws an error if the given bases are not valid.
514  // Catch this, and re-throw a nicer, more understandable exception instead.
515  throw std::runtime_error(
516  "At chromosome \"" + current_locus_.chromosome + "\" position " +
517  std::to_string( current_locus_.position ) + ", the reference genome has base '" +
518  std::string( 1, ref_base ) + "', which is not a valid nucleic acid code"
519  );
520  }
521  if( ! contains ) {
522  throw std::runtime_error(
523  "At chromosome \"" + current_locus_.chromosome + "\" position " +
524  std::to_string( current_locus_.position ) +
525  ", the reference base in the data is '" +
526  std::string( 1, cur_var.reference_base ) + "'. " +
527  "However, the reference genome has base '" + std::string( 1, ref_base ) +
528  "', which does not code for that base, and hence likely points to "
529  "some kind of mismatch"
530  );
531  }
532  } else {
533  cur_var.reference_base = utils::to_upper( ref_base );
534  }
535 }
536 
537 // -------------------------------------------------------------------------
538 // check_input_iterator_
539 // -------------------------------------------------------------------------
540 
541 void VariantGaplessInputStream::Iterator::check_input_iterator_()
542 {
543  if( iterator_->chromosome.empty() || iterator_->position == 0 ) {
544  throw std::runtime_error(
545  "In VariantGaplessInputStream: Invalid position "
546  "with empty chromosome name or zero position."
547  );
548  }
549 }
550 
551 // -------------------------------------------------------------------------
552 // current_locus_is_covered_by_genome_locus_set_
553 // -------------------------------------------------------------------------
554 
555 bool VariantGaplessInputStream::Iterator::current_locus_is_covered_by_genome_locus_set_()
556 {
557  assert( parent_ );
558  assert( current_locus_.chromosome != "" );
559  assert( current_locus_.position > 0 );
560 
561  // Without a given genome locus set, we always consider this position to be covered.
562  if( ! parent_->genome_locus_set_ ) {
563  return true;
564  }
565 
566  // If we are here, we have a genome locus set filter given.
567  // If the chromosome is not in the set, that means that the position is not covered.
568  if( genome_locus_set_it_ == parent_->genome_locus_set_->end() ) {
569  return false;
570  }
571 
572  // If it contains the given chromosome, we use that to determine if the locus is covered.
573  // We assume that the iterator is at the current chromosome.
574  assert( genome_locus_set_it_->first == current_locus_.chromosome );
575  return parent_->genome_locus_set_->is_covered( genome_locus_set_it_, current_locus_.position );
576 }
577 
578 } // namespace population
579 } // namespace genesis
genesis::population::locus_equal
bool locus_equal(std::string const &l_chromosome, size_t l_position, std::string const &r_chromosome, size_t r_position)
Equality comparison (==) for two loci in a genome.
Definition: function/genome_locus.hpp:300
functions.hpp
genesis::sequence::nucleic_acid_code_containment
bool nucleic_acid_code_containment(char a, char b, bool undetermined_matches_all)
Compare two nucleic acid codes and check if they are equal, taking degenerated/ambiguous characters i...
Definition: codes.cpp:596
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
sample_counts_filter.hpp
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
genesis::population::is_valid_base
constexpr bool is_valid_base(char c)
Return whether a given base is in ACGT, case insensitive.
Definition: population/function/functions.hpp:56
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
variant_filter.hpp
char.hpp
genesis::population::VariantGaplessInputStream::Iterator::Iterator
Iterator()=default
variant_gapless_input_stream.hpp
genesis::utils::contains
bool contains(const C &v, const T &x)
Returns whether a container object has a certain element.
Definition: algorithm.hpp:74
codes.hpp