A library for working with phylogenetic and population genetic data.
v0.32.0
variant_gapless_input_stream.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_STREAM_VARIANT_GAPLESS_INPUT_STREAM_H_
2 #define GENESIS_POPULATION_STREAM_VARIANT_GAPLESS_INPUT_STREAM_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
42 
43 #include <cassert>
44 #include <functional>
45 #include <memory>
46 #include <stdexcept>
47 #include <string>
48 #include <unordered_set>
49 #include <utility>
50 #include <vector>
51 
52 namespace genesis {
53 namespace population {
54 
55 // =================================================================================================
56 // Variant Gapless Input Stream
57 // =================================================================================================
58 
95 {
96 public:
97 
98  // -------------------------------------------------------------------------
99  // Typedefs and Enums
100  // -------------------------------------------------------------------------
101 
104  using pointer = value_type const*;
105  using reference = value_type const&;
106  using difference_type = std::ptrdiff_t;
107  using iterator_category = std::input_iterator_tag;
108 
109  // ======================================================================================
110  // Internal Iterator
111  // ======================================================================================
112 
116  class Iterator
117  {
118  public:
119 
120  // -------------------------------------------------------------------------
121  // Constructors and Rule of Five
122  // -------------------------------------------------------------------------
123 
126  using pointer = value_type const*;
127  using reference = value_type const&;
128  using iterator_category = std::input_iterator_tag;
129 
130  public:
131 
132  Iterator() = default;
133 
134  private:
135 
137 
138  public:
139 
140  ~Iterator() = default;
141 
142  Iterator( self_type const& ) = default;
143  Iterator( self_type&& ) = default;
144 
145  Iterator& operator= ( self_type const& ) = default;
146  Iterator& operator= ( self_type&& ) = default;
147 
149 
150  // -------------------------------------------------------------------------
151  // Accessors
152  // -------------------------------------------------------------------------
153 
154  Variant const * operator->() const
155  {
156  return &current_variant_();
157  }
158 
159  // Variant * operator->()
160  // {
161  // return &current_variant_();
162  // }
163 
164  Variant const & operator*() const
165  {
166  return current_variant_();
167  }
168 
169  // Variant & operator*()
170  // {
171  // return current_variant_();
172  // }
173 
177  GenomeLocus const& locus() const
178  {
179  return current_locus_;
180  }
181 
182  // -------------------------------------------------------------------------
183  // Iteration
184  // -------------------------------------------------------------------------
185 
187  {
188  advance_();
189  return *this;
190  }
191 
192  // self_type operator ++(int)
193  // {
194  // auto cpy = *this;
195  // advance_();
196  // return cpy;
197  // }
198 
199  operator bool() const
200  {
201  return parent_ != nullptr;
202  }
203 
212  bool operator==( self_type const& it ) const
213  {
214  return parent_ == it.parent_;
215  }
216 
217  bool operator!=( self_type const& it ) const
218  {
219  return !(*this == it);
220  }
221 
222  // -------------------------------------------------------------------------
223  // Internal Members
224  // -------------------------------------------------------------------------
225 
226  private:
227 
231  void advance_();
232 
244  void init_chromosome_();
245 
254  void advance_current_locus_();
255 
268  void advance_current_locus_beyond_input_();
269 
274  bool has_more_ref_loci_on_current_chromosome_();
275 
287  std::string find_next_extra_chromosome_();
288 
295  void prepare_current_variant_();
296 
303  void prepare_current_variant_ref_base_();
304 
308  void check_input_iterator_();
309 
316  bool current_locus_is_covered_by_genome_locus_set_();
317 
323  Variant& current_variant_()
324  {
325  if( current_variant_is_missing_ ) {
326  return missing_variant_;
327  } else {
328  assert( iterator_ );
329  return *iterator_;
330  }
331  }
332 
338  Variant const& current_variant_() const
339  {
340  if( current_variant_is_missing_ ) {
341  return missing_variant_;
342  } else {
343  assert( iterator_ );
344  return *iterator_;
345  }
346  }
347 
348  private:
349 
350  // Parent
351  VariantGaplessInputStream* parent_ = nullptr;
352 
353  // Keep track of the locus that the iterator currently is at.
354  GenomeLocus current_locus_;
355 
356  // Is the current variant missing? If so, we are using the dummy missing_variant_,
357  // otherwise the one of the input iterator.
358  bool current_variant_is_missing_ = false;
359 
360  // Storage for the missing variants of the iterators. This serves as a dummy variant
361  // for all positions of the input without data, so that we do not need to re-allocate
362  // every time for this. It is initialized to zero counts for all base counts.
363  // In each iteration step, the position and refrence base are updated.
364  // To allow manupulation or move-from, we reset the whole Variant each time though.
365  // To this end, we store the number of samples here as well.
366  Variant missing_variant_;
367  size_t num_samples_;
368 
369  // Keep the iterators that we want to traverse. We only need the begin() iterators,
370  // as they are themselves able to tell us if they are still good (via their operator bool).
372 
373  // Cache for the ref genome and seq dict sequences, as well as the genome locus set,
374  // so that we do not have to find them on every iteration here.
378 
379  // We keep track of which chromosomes we have seen yet, in order to avoid errors due to
380  // unordered input, and also to know which chromosomes to process later for the case that
381  // iterate_extra_chromosomes() is set and there are unprocessed chromosomes left.
382  std::unordered_set<std::string> processed_chromosomes_;
383  };
384 
385  // ======================================================================================
386  // Main Class
387  // ======================================================================================
388 
389  // -------------------------------------------------------------------------
390  // Constructors and Rule of Five
391  // -------------------------------------------------------------------------
392 
393  VariantGaplessInputStream() = default;
394 
397  )
398  : input_( input )
399  {}
400 
403  )
404  : input_( std::move( input ))
405  {}
406 
407  ~VariantGaplessInputStream() = default;
408 
411 
414 
415  friend Iterator;
416 
417  // -------------------------------------------------------------------------
418  // Input
419  // -------------------------------------------------------------------------
420 
421  VariantInputStream const& input() const
422  {
423  return input_;
424  }
425 
427  {
428  return input_;
429  }
430 
431  // -------------------------------------------------------------------------
432  // Iteration
433  // -------------------------------------------------------------------------
434 
439  {
440  if( started_ ) {
441  throw std::runtime_error(
442  "VariantGaplessInputStream implements an input iterator (single pass); "
443  "begin() can hence not be called multiple times."
444  );
445  }
446  started_ = true;
447  return Iterator( this );
448  }
449 
454  {
455  return Iterator( nullptr );
456  }
457 
458  // ---------------------------------------------------------------------
459  // Settings
460  // ---------------------------------------------------------------------
461 
463  {
464  return iterate_extra_chromosomes_;
465  }
466 
476  {
477  if( started_ ) {
478  throw std::runtime_error(
479  "VariantGaplessInputStream::iterate_extra_chromosomes() cannot be called "
480  "once the iteration has been started with begin()."
481  );
482  }
483  iterate_extra_chromosomes_ = value;
484  return *this;
485  }
486 
490  std::shared_ptr<::genesis::sequence::ReferenceGenome> reference_genome() const
491  {
492  return ref_genome_;
493  }
494 
507  self_type& reference_genome( std::shared_ptr<::genesis::sequence::ReferenceGenome> value )
508  {
509  if( started_ ) {
510  throw std::runtime_error(
511  "VariantGaplessInputStream::reference_genome() cannot be called "
512  "once the iteration has been started with begin()."
513  );
514  }
515  if( value && seq_dict_ ) {
516  throw std::invalid_argument(
517  "Cannot set reference_genome() in VariantGaplessInputStream "
518  "when sequence_dict() is already provided."
519  );
520  }
521  ref_genome_ = value;
522  return *this;
523  }
524 
528  std::shared_ptr<genesis::sequence::SequenceDict> sequence_dict() const
529  {
530  return seq_dict_;
531  }
532 
541  self_type& sequence_dict( std::shared_ptr<genesis::sequence::SequenceDict> value )
542  {
543  if( started_ ) {
544  throw std::runtime_error(
545  "VariantGaplessInputStream::sequence_dict() cannot be called "
546  "once the iteration has been started with begin()."
547  );
548  }
549  if( value && ref_genome_ ) {
550  throw std::invalid_argument(
551  "Cannot set sequence_dict() in VariantGaplessInputStream "
552  "when reference_genome() is already provided."
553  );
554  }
555  seq_dict_ = value;
556  return *this;
557  }
558 
562  std::shared_ptr<GenomeLocusSet> genome_locus_set() const
563  {
564  return genome_locus_set_;
565  }
566 
589  self_type& genome_locus_set( std::shared_ptr<GenomeLocusSet> value )
590  {
591  if( started_ ) {
592  throw std::runtime_error(
593  "VariantGaplessInputStream::genome_locus_set() cannot be called "
594  "once the iteration has been started with begin()."
595  );
596  }
597  genome_locus_set_ = value;
598  return *this;
599  }
600 
601  // -------------------------------------------------------------------------
602  // Internal Members
603  // -------------------------------------------------------------------------
604 
605 private:
606 
607  VariantInputStream input_;
608  bool iterate_extra_chromosomes_ = true;
609  bool started_ = false;
610 
611  // We offer two ways of specifying chromosome lengths.
612  // With ref genome, we additionally gain access to the bases.
613  // Also, we here subset to regions if needed, to avoid unnecessary work later.
614  std::shared_ptr<::genesis::sequence::ReferenceGenome> ref_genome_;
615  std::shared_ptr<::genesis::sequence::SequenceDict> seq_dict_;
616  std::shared_ptr<GenomeLocusSet> genome_locus_set_;
617 
618 };
619 
620 } // namespace population
621 } // namespace genesis
622 
623 #endif // include guard
genesis::population::VariantGaplessInputStream::Iterator::operator!=
bool operator!=(self_type const &it) const
Definition: variant_gapless_input_stream.hpp:217
genesis::population::VariantGaplessInputStream::reference
value_type const & reference
Definition: variant_gapless_input_stream.hpp:105
genome_locus.hpp
genesis::population::VariantGaplessInputStream::Iterator::iterator_category
std::input_iterator_tag iterator_category
Definition: variant_gapless_input_stream.hpp:128
functions.hpp
genesis::population::VariantGaplessInputStream::sequence_dict
std::shared_ptr< genesis::sequence::SequenceDict > sequence_dict() const
Get the currently set sequence dictionary used for the chromosome lengths.
Definition: variant_gapless_input_stream.hpp:528
genesis::population::VariantGaplessInputStream::iterator_category
std::input_iterator_tag iterator_category
Definition: variant_gapless_input_stream.hpp:107
genesis::population::VariantGaplessInputStream::VariantGaplessInputStream
VariantGaplessInputStream()=default
genesis::population::VariantGaplessInputStream::operator=
VariantGaplessInputStream & operator=(VariantGaplessInputStream const &)=default
genome_locus.hpp
genesis::population::VariantGaplessInputStream::begin
Iterator begin()
Begin the iteration.
Definition: variant_gapless_input_stream.hpp:438
genesis::sequence::ReferenceGenome::const_iterator
typename std::vector< Sequence >::const_iterator const_iterator
Definition: reference_genome.hpp:74
genesis::population::VariantGaplessInputStream::input
VariantInputStream const & input() const
Definition: variant_gapless_input_stream.hpp:421
genesis::population::VariantGaplessInputStream::Iterator
friend Iterator
Definition: variant_gapless_input_stream.hpp:415
genesis::population::VariantGaplessInputStream::iterate_extra_chromosomes
self_type & iterate_extra_chromosomes(bool value)
Determine whether extra chromosomes without any data in the input are itereated.
Definition: variant_gapless_input_stream.hpp:475
genesis::population::VariantGaplessInputStream::Iterator::VariantGaplessInputStream
friend VariantGaplessInputStream
Definition: variant_gapless_input_stream.hpp:148
genesis::population::GenomeLocus
A single locus, that is, a position (or coordinate) on a chromosome.
Definition: genome_locus.hpp:64
genesis::population::VariantGaplessInputStream::Iterator::~Iterator
~Iterator()=default
genesis::population::VariantGaplessInputStream::Iterator::pointer
value_type const * pointer
Definition: variant_gapless_input_stream.hpp:126
genesis::population::VariantGaplessInputStream::Iterator::operator==
bool operator==(self_type const &it) const
Compare two iterators for equality.
Definition: variant_gapless_input_stream.hpp:212
genesis::sequence::SequenceDict::const_iterator
std::vector< Entry >::const_iterator const_iterator
Definition: sequence_dict.hpp:109
genesis::population::VariantGaplessInputStream::iterate_extra_chromosomes
bool iterate_extra_chromosomes() const
Definition: variant_gapless_input_stream.hpp:462
genesis::population::VariantGaplessInputStream::genome_locus_set
std::shared_ptr< GenomeLocusSet > genome_locus_set() const
Get the currently set GenomeLocusSet for subsetting the iteration positions.
Definition: variant_gapless_input_stream.hpp:562
genesis::population::VariantGaplessInputStream::~VariantGaplessInputStream
~VariantGaplessInputStream()=default
genesis::population::VariantGaplessInputStream
Stream adapter that visits every position in the genome.
Definition: variant_gapless_input_stream.hpp:94
genesis::population::VariantGaplessInputStream::input
VariantInputStream & input()
Definition: variant_gapless_input_stream.hpp:426
genesis::population::VariantGaplessInputStream::Iterator::operator++
self_type & operator++()
Definition: variant_gapless_input_stream.hpp:186
sequence_dict.hpp
genesis::population::VariantGaplessInputStream::pointer
value_type const * pointer
Definition: variant_gapless_input_stream.hpp:104
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
reference_genome.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::VariantGaplessInputStream::genome_locus_set
self_type & genome_locus_set(std::shared_ptr< GenomeLocusSet > value)
Set a genomic locus set for subsetting the iteration positions.
Definition: variant_gapless_input_stream.hpp:589
genome_locus_set.hpp
genesis::population::VariantGaplessInputStream::reference_genome
std::shared_ptr<::genesis::sequence::ReferenceGenome > reference_genome() const
Get the currently set reference genome to be used for the chromosome lengths and bases.
Definition: variant_gapless_input_stream.hpp:490
genesis::population::VariantGaplessInputStream::difference_type
std::ptrdiff_t difference_type
Definition: variant_gapless_input_stream.hpp:106
genesis::population::VariantGaplessInputStream::reference_genome
self_type & reference_genome(std::shared_ptr<::genesis::sequence::ReferenceGenome > value)
Set a reference genome to be used for the chromosome lengths and bases.
Definition: variant_gapless_input_stream.hpp:507
variant_input_stream.hpp
genesis::population::VariantGaplessInputStream::Iterator::Iterator
Iterator()=default
genesis::population::VariantGaplessInputStream::VariantGaplessInputStream
VariantGaplessInputStream(VariantInputStream const &input)
Definition: variant_gapless_input_stream.hpp:395
genesis::population::VariantGaplessInputStream::VariantGaplessInputStream
VariantGaplessInputStream(VariantInputStream &&input)
Definition: variant_gapless_input_stream.hpp:401
genesis::population::VariantGaplessInputStream::Iterator::operator=
Iterator & operator=(self_type const &)=default
genesis::population::VariantGaplessInputStream::Iterator
Iterator over loci of the input source.
Definition: variant_gapless_input_stream.hpp:116
genesis::population::VariantGaplessInputStream::end
Iterator end()
End marker for the iteration.
Definition: variant_gapless_input_stream.hpp:453
variant.hpp
genesis::population::VariantGaplessInputStream::Iterator::reference
value_type const & reference
Definition: variant_gapless_input_stream.hpp:127
genesis::population::VariantGaplessInputStream::Iterator::operator*
const Variant & operator*() const
Definition: variant_gapless_input_stream.hpp:164
genesis::utils::GenericInputStream< Variant, VariantInputStreamData >
genesis::utils::GenericInputStream< Variant, VariantInputStreamData >::Iterator
friend Iterator
Definition: generic_input_stream.hpp:846
genesis::population::VariantGaplessInputStream::Iterator::locus
GenomeLocus const & locus() const
Return the current locus where the iteration is at.
Definition: variant_gapless_input_stream.hpp:177
genesis::population::GenomeLocusSet::const_iterator
typename std::unordered_map< std::string, utils::Bitvector >::const_iterator const_iterator
Definition: genome_locus_set.hpp:91
genesis::population::VariantGaplessInputStream::Iterator::operator->
const Variant * operator->() const
Definition: variant_gapless_input_stream.hpp:154
genesis::population::VariantGaplessInputStream::sequence_dict
self_type & sequence_dict(std::shared_ptr< genesis::sequence::SequenceDict > value)
Set a sequence dictionary to be used for the chromosome lengths.
Definition: variant_gapless_input_stream.hpp:541