A library for working with phylogenetic and population genetic data.
v0.27.0
variant_parallel_input_iterator.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FORMATS_VARIANT_PARALLEL_INPUT_ITERATOR_H_
2 #define GENESIS_POPULATION_FORMATS_VARIANT_PARALLEL_INPUT_ITERATOR_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2022 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
40 
41 #include <cassert>
42 #include <functional>
43 #include <set>
44 #include <stdexcept>
45 #include <string>
46 #include <utility>
47 #include <vector>
48 
49 namespace genesis {
50 namespace population {
51 
52 // =================================================================================================
53 // Variant Parallel Input Iterator
54 // =================================================================================================
55 
132 {
133 public:
134 
135  // -------------------------------------------------------------------------
136  // Typedefs and Enums
137  // -------------------------------------------------------------------------
138 
/**
 * @brief Select which loci of an input are used.
 */
169  enum class ContributionType
170  {
     /// For a given input, stop at all its positions.
177  kCarrying,
178 
     /// For a given input, only stop at positions where other inputs also want to stop.
187  kFollowing
188  };
189 
192 
193  // ======================================================================================
194  // Internal Iterator
195  // ======================================================================================
196 
205  class Iterator
206  {
207  public:
208 
209  // -------------------------------------------------------------------------
210  // Constructors and Rule of Five
211  // -------------------------------------------------------------------------
212 
214  using value_type = std::vector<utils::Optional<Variant>>;
215  using pointer = value_type const*;
216  using reference = value_type const&;
217  using iterator_category = std::input_iterator_tag;
218 
219  private:
220 
221  Iterator() = default;
223 
224  public:
225 
226  ~Iterator() = default;
227 
228  Iterator( self_type const& ) = default;
229  Iterator( self_type&& ) = default;
230 
231  Iterator& operator= ( self_type const& ) = default;
232  Iterator& operator= ( self_type&& ) = default;
233 
235 
236  // -------------------------------------------------------------------------
237  // Accessors
238  // -------------------------------------------------------------------------
239 
/**
 * @brief Member access to the per-input variants stored in this iterator (const).
 *
 * @return Pointer to the internal `variants_` vector.
 */
240  std::vector<utils::Optional<Variant>> const * operator->() const
241  {
242  return &variants_;
243  }
244 
/**
 * @brief Member access to the per-input variants stored in this iterator (mutable).
 *
 * @return Pointer to the internal `variants_` vector; the caller can modify its elements.
 */
245  std::vector<utils::Optional<Variant>> * operator->()
246  {
247  return &variants_;
248  }
249 
/**
 * @brief Dereference the iterator (const).
 *
 * @return Const reference to the internal `variants_` vector.
 */
250  std::vector<utils::Optional<Variant>> const & operator*() const
251  {
252  return variants_;
253  }
254 
/**
 * @brief Dereference the iterator (mutable).
 *
 * @return Non-const reference to the internal `variants_` vector.
 */
255  std::vector<utils::Optional<Variant>> & operator*()
256  {
257  return variants_;
258  }
259 
/**
 * @brief Return the data of all input iterators at the current locus.
 *
 * Named equivalent of the const `operator*()`: both return the internal `variants_` vector.
 */
266  std::vector<utils::Optional<Variant>> const& variants() const
267  {
268  return variants_;
269  }
270 
/**
 * @brief Return the data of all input iterators at the current locus.
 *
 * Non-const overload: hands out a mutable reference to the internal `variants_` vector.
 */
274  std::vector<utils::Optional<Variant>>& variants()
275  {
276  return variants_;
277  }
278 
/**
 * @brief Get access to the input iterators that have been added to this parallel iterator.
 *
 * Reads the `inputs_` of the parent through `generator_`, so this must only be called on an
 * iterator whose `generator_` is set; this is checked by assertion only.
 */
286  std::vector<VariantInputIterator> const& inputs() const
287  {
288  // We assume that the user only does this when the iterator is not an end() iterator.
289  assert( generator_ );
290  return generator_->inputs_;
291  }
292 
/**
 * @brief Get access to an input iterator that has been added to this parallel iterator.
 *
 * Reads the `inputs_` of the parent through `generator_`, so this must only be called on an
 * iterator whose `generator_` is set (checked by assertion only).
 *
 * @param index Position of the input in the parent's `inputs_`; accessed via `operator[]`,
 *              that is, without a boundary check.
 */
300  VariantInputIterator const& input_at( size_t index ) const
301  {
     // Same precondition as inputs(): generator_ is dereferenced below, so it must not be null.
     assert( generator_ );
302  return generator_->inputs_[index];
303  }
304 
/**
 * @brief Return the data of the input iterators at the given index at the current locus.
 *
 * @param index Position in `variants_`. Access goes through `at()`, so the index is
 *              boundary checked.
 */
314  utils::Optional<Variant> const& variant_at( size_t index ) const
315  {
316  // Return with boundary check.
317  return variants_.at( index );
318  }
319 
324  {
325  // Return with boundary check.
326  return variants_.at( index );
327  }
328 
354  bool allow_ref_base_mismatches = false,
355  bool allow_alt_base_mismatches = true,
356  bool move_samples = false
357  );
358 
/**
 * @brief Return the current locus where the iteration is at.
 *
 * @return Const reference to the stored `current_locus_`.
 */
362  GenomeLocus const& locus() const
363  {
364  return current_locus_;
365  }
366 
367  // -------------------------------------------------------------------------
368  // Iteration
369  // -------------------------------------------------------------------------
370 
372  {
373  advance_();
374  return *this;
375  }
376 
378  {
379  auto cpy = *this;
380  advance_();
381  return cpy;
382  }
383 
/**
 * @brief Check whether this iterator still refers to a parent, that is, whether
 * `generator_` is not null.
 *
 * NOTE(review): the conversion is not marked `explicit`, so it also takes part in implicit
 * conversions - confirm that this is intended.
 */
384  operator bool() const
385  {
386  return generator_ != nullptr;
387  }
388 
/**
 * @brief Compare two iterators for equality.
 *
 * Only the `generator_` pointers are compared; the current locus and the stored variants
 * do not take part in the comparison.
 */
397  bool operator==( self_type const& it ) const
398  {
399  return generator_ == it.generator_;
400  }
401 
/**
 * @brief Compare two iterators for inequality.
 *
 * Defined as the negation of `operator==`.
 */
402  bool operator!=( self_type const& it ) const
403  {
404  return !(*this == it);
405  }
406 
407  // -------------------------------------------------------------------------
408  // Internal Members
409  // -------------------------------------------------------------------------
410 
411  private:
412 
/**
 * @brief Move the iterator forward by one step.
 *
 * Dispatches to one of two implementations, depending on the `has_carrying_input_` flag
 * of the parent. Requires a non-null `generator_` (checked by assertion only).
 */
416  void advance_()
417  {
418  // Some basic checks.
419  assert( generator_ );
420  assert( generator_->inputs_.size() == generator_->selections_.size() );
421  assert( generator_->inputs_.size() == iterators_.size() );
422 
423  // Depending on what type of inputs we have, we need two different algorithms
424  // to find the next position to iterate to.
425  if( generator_->has_carrying_input_ ) {
426  advance_using_carrying_();
427  } else {
428  advance_using_only_following_();
429  }
430  }
431 
435  void advance_using_carrying_();
436 
443  void advance_using_only_following_();
444 
449  void increment_iterator_( VariantInputIterator::Iterator& iterator );
450 
455  void assert_correct_chr_and_pos_( VariantInputIterator::Iterator const& iterator );
456 
462  void update_variants_();
463 
464  private:
465 
466  // Parent
467  VariantParallelInputIterator* generator_ = nullptr;
468 
469  // Keep track of the locus that the iterator currently is at.
470  // Not all sources have to be there (if they don't have data for that locus), in which case
471  // we want them to be at the next position in their data beyond the current locus.
472  GenomeLocus current_locus_;
473 
474  // Keep the iterators that we want to traverse. We only need the begin() iterators,
475  // as they are themselves able to tell us if they are still good (via their operator bool).
476  std::vector<VariantInputIterator::Iterator> iterators_;
477 
478  // We need to store how many samples (BaseCounts objects) the Variant of each iterator has,
479  // in order to fill in the empty ones at the iterator positions where they don't have data.
480  // We cannot always look that up from the iterators themselves, as they might already have
481  // reached their end of the data while others are still having data, so we store it here.
482  std::vector<size_t> variant_sizes_;
483  size_t variant_size_sum_;
484 
485  // Storage for the variants of the iterators. We need these copies, as not all iterators
486  // are expected to have all loci in the genome, so if we instead gave access to the
487  // iterators directly to the user of this class, they'd have to check if the iterator is at
488  // the correct locus, and so on. So instead, we offer a user-friendly interface that they
489  // can simply iterate over and check if the optional is empty or not. Bit of copying,
490  // but then again, each layer of abstraction comes at some cost...
491  // At least, we move (not copy) data into here, for efficiency.
492  std::vector<utils::Optional<Variant>> variants_;
493 
494  // Store the current additional carrying locus that we are at (if those have been
495  // added; if not, we just store the end iterator here).
496  std::set<GenomeLocus>::const_iterator carrying_locus_it_;
497 
498  };
499 
500  // ======================================================================================
501  // Main Class
502  // ======================================================================================
503 
504  // -------------------------------------------------------------------------
505  // Constructors and Rule of Five
506  // -------------------------------------------------------------------------
507 
508  VariantParallelInputIterator() = default;
509  ~VariantParallelInputIterator() = default;
510 
511  VariantParallelInputIterator( self_type const& ) = default;
513 
514  self_type& operator= ( self_type const& ) = default;
515  self_type& operator= ( self_type&& ) = default;
516 
517  friend Iterator;
518 
519  // -------------------------------------------------------------------------
520  // Iteration
521  // -------------------------------------------------------------------------
522 
530  {
531  return Iterator( this );
532  }
533 
538  {
539  return Iterator( nullptr );
540  }
541 
542  // -------------------------------------------------------------------------
543  // Input Sources
544  // -------------------------------------------------------------------------
545 
550  VariantInputIterator const& input,
551  ContributionType selection
552  ) {
553  inputs_.emplace_back( input );
554  selections_.emplace_back( selection );
555  assert( inputs_.size() == selections_.size() );
556 
557  if( selection == ContributionType::kCarrying ) {
558  has_carrying_input_ = true;
559  }
560  return *this;
561  }
562 
570  std::function<bool(Variant&)> input_element_generator,
571  ContributionType selection
572  ) {
573  add_variant_input_iterator( VariantInputIterator( input_element_generator ), selection );
574  return *this;
575  }
576 
/**
 * @brief Get access to the input iterators that have been added to this parallel iterator.
 *
 * @return Const reference to the stored `inputs_`.
 */
580  std::vector<VariantInputIterator> const& inputs() const
581  {
582  return inputs_;
583  }
584 
/**
 * @brief Get access to the input iterators that have been added to this parallel iterator.
 *
 * Non-const overload: hands out a mutable reference to the stored `inputs_`.
 * NOTE(review): `selections_` is expected to have the same size as `inputs_` (see the
 * assertions in this class), so callers presumably must not add or remove elements here.
 */
595  std::vector<VariantInputIterator>& inputs()
596  {
597  return inputs_;
598  }
599 
/**
 * @brief Get access to an input iterator that has been added to this parallel iterator.
 *
 * @param index Position of the input in `inputs_`; accessed via `operator[]`, that is,
 *              without a boundary check.
 */
603  VariantInputIterator const& input_at( size_t index ) const
604  {
605  return inputs_[index];
606  }
607 
612  {
613  return inputs_[index];
614  }
615 
/**
 * @brief Return the number of input sources added.
 */
619  size_t input_size() const
620  {
     // Each input has exactly one ContributionType entry, so both vectors have the same size.
621  assert( inputs_.size() == selections_.size() );
622  return inputs_.size();
623  }
624 
625  // -------------------------------------------------------------------------
626  // Input Loci
627  // -------------------------------------------------------------------------
628 
651  {
652  // Error check.
653  if( locus.chromosome.empty() || locus.position == 0 ) {
654  throw std::invalid_argument(
655  "Cannot add a carrying locus with empty chromosome or position 0 "
656  "to VariantParallelInputIterator"
657  );
658  }
659 
660  // Add to the list. Also, if loci are added with this function, these serve as carrying loci,
661  // and so we can always use advance_using_carrying_() to find the next locus;
662  // mark this by setting has_carrying_input_.
663  carrying_loci_.insert( locus );
664  has_carrying_input_ = true;
665  return *this;
666  }
667 
/**
 * @brief Add a set of GenomeLoci that are used as carrying loci in the iteration.
 *
 * Forwards to the iterator-pair overload for the whole vector.
 *
 * @param loci Loci to add.
 * @return Reference to this object, so that calls can be chained.
 */
674  self_type& add_carrying_loci( std::vector<GenomeLocus> const& loci )
675  {
676  add_carrying_loci( loci.begin(), loci.end() );
677  return *this;
678  }
679 
/**
 * @brief Add a set of GenomeLoci that are used as carrying loci in the iteration.
 *
 * Calls add_carrying_locus() once for every element in the range `[first, last)`.
 *
 * @param first Iterator to the first locus to add.
 * @param last  Iterator past the last locus to add.
 * @return Reference to this object, so that calls can be chained.
 */
683  template<class ForwardIterator>
684  self_type& add_carrying_loci( ForwardIterator first, ForwardIterator last )
685  {
686  while( first != last ) {
687  add_carrying_locus( *first );
688  ++first;
689  }
690 
691  // Version for if we wanted to switch the set for a vector.
692  // Sort the list of loci. All this is so inefficient, as we store the chromosome names
693  // again and again for each locus. The sorting is okay though, we need to have that
694  // complexity somewhere - using a std::set for example would just shift the place where
695  // we do the sorting, but would make iteration a bit more tricky, and would need even more
696  // memory.
697  // std::sort( carrying_loci_.begin(), carrying_loci_.end() );
698  // carrying_loci_.erase(
699  // std::unique( carrying_loci_.begin(), carrying_loci_.end() ),
700  // carrying_loci_.end()
701  // );
702 
703  return *this;
704  }
705 
706  // -------------------------------------------------------------------------
707  // Data Members
708  // -------------------------------------------------------------------------
709 
710 private:
711 
712  // Store all input sources, as well as the type (carrying or following) of how we want
713  // to traverse them. We keep track whether at least one of them is of type carrying.
714  // If not (all following), the advance function of the iterator needs to be special.
715  std::vector<VariantInputIterator> inputs_;
716  std::vector<ContributionType> selections_;
717  bool has_carrying_input_ = false;
718 
719  // Store all additional loci that we want to include as stops in the iterator.
720  // Memory-wise, this is highly inefficient, as we store the chromosome name for each of them.
721  // But for now, this is easiest and fastest. We use a set, so that adding loci one after another
722  // always results in a sorted container, without having to re-sort every time.
723  // This again has a bit of a higher memory impact, but that should be okay for now.
724  std::set<GenomeLocus> carrying_loci_;
725 
726 };
727 
728 } // namespace population
729 } // namespace genesis
730 
731 #endif // include guard
genesis::utils::LambdaIterator
Type erasure for iterators, using std::function to eliminate the underlying input type.
Definition: lambda_iterator.hpp:150
base_counts.hpp
genesis::population::VariantParallelInputIterator::Iterator::operator*
const std::vector< utils::Optional< Variant > > & operator*() const
Definition: variant_parallel_input_iterator.hpp:250
genesis::population::VariantParallelInputIterator::inputs
std::vector< VariantInputIterator > const & inputs() const
Get access to the input iterators that have been added to this parallel iterator.
Definition: variant_parallel_input_iterator.hpp:580
genesis::population::VariantParallelInputIterator::Iterator::inputs
std::vector< VariantInputIterator > const & inputs() const
Get access to the input iterators that have been added to this parallel iterator.
Definition: variant_parallel_input_iterator.hpp:286
genesis::population::VariantParallelInputIterator::ContributionType
ContributionType
Select which loci of an input are used.
Definition: variant_parallel_input_iterator.hpp:169
genesis::population::VariantParallelInputIterator::Iterator::iterator_category
std::input_iterator_tag iterator_category
Definition: variant_parallel_input_iterator.hpp:217
genesis::population::VariantParallelInputIterator
Iterate multiple input sources that yield Variants in parallel.
Definition: variant_parallel_input_iterator.hpp:131
genesis::utils::LambdaIterator::Iterator
friend Iterator
Definition: lambda_iterator.hpp:714
genesis::population::VariantParallelInputIterator::inputs
std::vector< VariantInputIterator > & inputs()
Get access to the input iterators that have been added to this parallel iterator.
Definition: variant_parallel_input_iterator.hpp:595
genesis::population::GenomeLocus::position
size_t position
Definition: genome_locus.hpp:59
genome_locus.hpp
genesis::population::VariantParallelInputIterator::Iterator::joined_variant
Variant joined_variant(bool allow_ref_base_mismatches=false, bool allow_alt_base_mismatches=true, bool move_samples=false)
Create a single Variant instance that combines all Variants from the input sources at the current loc...
Definition: variant_parallel_input_iterator.cpp:125
genesis::population::VariantParallelInputIterator::Iterator::operator==
bool operator==(self_type const &it) const
Compare two iterators for equality.
Definition: variant_parallel_input_iterator.hpp:397
genesis::population::VariantParallelInputIterator::add_carrying_loci
self_type & add_carrying_loci(std::vector< GenomeLocus > const &loci)
Add a set of GenomeLoci that are used as carrying loci in the iteration.
Definition: variant_parallel_input_iterator.hpp:674
genesis::population::VariantParallelInputIterator::input_at
VariantInputIterator & input_at(size_t index)
Get access to an input iterator that has been added to this parallel iterator.
Definition: variant_parallel_input_iterator.hpp:611
genesis::population::VariantParallelInputIterator::Iterator::operator->
const std::vector< utils::Optional< Variant > > * operator->() const
Definition: variant_parallel_input_iterator.hpp:240
genesis::population::GenomeLocus
A single locus, that is, a position (or coordinate) on a chromosome.
Definition: genome_locus.hpp:56
genesis::population::VariantParallelInputIterator::Iterator::variants
std::vector< utils::Optional< Variant > > const & variants() const
Return the data of all input iterators at the current locus.
Definition: variant_parallel_input_iterator.hpp:266
genesis::population::VariantParallelInputIterator::Iterator::~Iterator
~Iterator()=default
genesis::population::VariantParallelInputIterator::self_type
VariantParallelInputIterator self_type
Definition: variant_parallel_input_iterator.hpp:190
genesis::population::GenomeLocus::chromosome
std::string chromosome
Definition: genome_locus.hpp:58
genesis::population::VariantParallelInputIterator::Iterator::locus
GenomeLocus const & locus() const
Return the current locus where the iteration is at.
Definition: variant_parallel_input_iterator.hpp:362
genome_locus.hpp
genesis::utils::Optional
Simplistic optional: requires T to be default constructible, copyable.
Definition: optional.hpp:178
genesis::population::VariantParallelInputIterator::Iterator::operator=
Iterator & operator=(self_type const &)=default
genesis::population::VariantParallelInputIterator::Iterator::pointer
value_type const * pointer
Definition: variant_parallel_input_iterator.hpp:215
genesis::population::VariantParallelInputIterator::VariantParallelInputIterator
VariantParallelInputIterator()=default
genesis::population::VariantParallelInputIterator::operator=
self_type & operator=(self_type const &)=default
genesis::population::VariantParallelInputIterator::ContributionType::kCarrying
@ kCarrying
For a given input, stop at all its positions.
genesis::population::VariantParallelInputIterator::Iterator::input_at
VariantInputIterator const & input_at(size_t index) const
Get access to an input iterator that has been added to this parallel iterator.
Definition: variant_parallel_input_iterator.hpp:300
genesis::population::VariantParallelInputIterator::ContributionType::kFollowing
@ kFollowing
For a given input, only stop at positions where other inputs also want to stop.
genesis::population::VariantParallelInputIterator::Iterator::reference
value_type const & reference
Definition: variant_parallel_input_iterator.hpp:216
genesis::population::VariantParallelInputIterator::Iterator
friend Iterator
Definition: variant_parallel_input_iterator.hpp:517
genesis::population::VariantParallelInputIterator::begin
Iterator begin()
Begin the iteration.
Definition: variant_parallel_input_iterator.hpp:529
genesis::population::VariantParallelInputIterator::Iterator::operator!=
bool operator!=(self_type const &it) const
Definition: variant_parallel_input_iterator.hpp:402
genesis::population::VariantParallelInputIterator::Iterator
Iterator over loci of the input sources.
Definition: variant_parallel_input_iterator.hpp:205
genesis::population::Variant
A single variant at a position in a chromosome, along with BaseCounts for a set of samples.
Definition: variant.hpp:62
genesis::population::VariantParallelInputIterator::end
Iterator end()
End marker for the iteration.
Definition: variant_parallel_input_iterator.hpp:537
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::VariantParallelInputIterator::input_at
VariantInputIterator const & input_at(size_t index) const
Get access to an input iterator that has been added to this parallel iterator.
Definition: variant_parallel_input_iterator.hpp:603
genesis::population::VariantParallelInputIterator::input_size
size_t input_size() const
Return the number of input sources added.
Definition: variant_parallel_input_iterator.hpp:619
variant_input_iterator.hpp
genesis::population::VariantParallelInputIterator::Iterator::variants
std::vector< utils::Optional< Variant > > & variants()
Return the data of all input iterators at the current locus.
Definition: variant_parallel_input_iterator.hpp:274
genesis::population::VariantParallelInputIterator::Iterator::operator->
std::vector< utils::Optional< Variant > > * operator->()
Definition: variant_parallel_input_iterator.hpp:245
genesis::population::VariantParallelInputIterator::add_carrying_locus
self_type & add_carrying_locus(GenomeLocus const &locus)
Add a set of GenomeLoci that are used as carrying loci in the iteration.
Definition: variant_parallel_input_iterator.hpp:650
genesis::population::VariantParallelInputIterator::Iterator::VariantParallelInputIterator
friend VariantParallelInputIterator
Definition: variant_parallel_input_iterator.hpp:234
genesis::population::VariantParallelInputIterator::Iterator::operator++
self_type & operator++()
Definition: variant_parallel_input_iterator.hpp:371
genesis::population::VariantParallelInputIterator::add_variant_input
self_type & add_variant_input(std::function< bool(Variant &)> input_element_generator, ContributionType selection)
Add an input to the parallel iterator.
Definition: variant_parallel_input_iterator.hpp:569
genesis::population::VariantParallelInputIterator::Iterator::variant_at
utils::Optional< Variant > & variant_at(size_t index)
Return the data of the input iterators at the given index at the current locus.
Definition: variant_parallel_input_iterator.hpp:323
variant.hpp
genesis::population::VariantParallelInputIterator::~VariantParallelInputIterator
~VariantParallelInputIterator()=default
genesis::population::VariantInputIterator
utils::LambdaIterator< Variant, VariantInputIteratorData > VariantInputIterator
Iterate Variants, using a variety of input file formats.
Definition: variant_input_iterator.hpp:124
genesis::population::VariantParallelInputIterator::add_carrying_loci
self_type & add_carrying_loci(ForwardIterator first, ForwardIterator last)
Add a set of GenomeLoci that are used as carrying loci in the iteration.
Definition: variant_parallel_input_iterator.hpp:684
genesis::population::VariantParallelInputIterator::Iterator::variant_at
utils::Optional< Variant > const & variant_at(size_t index) const
Return the data of the input iterators at the given index at the current locus.
Definition: variant_parallel_input_iterator.hpp:314
genesis::population::VariantParallelInputIterator::add_variant_input_iterator
self_type & add_variant_input_iterator(VariantInputIterator const &input, ContributionType selection)
Add an input to the parallel iterator.
Definition: variant_parallel_input_iterator.hpp:549
genesis::population::VariantParallelInputIterator::Iterator::value_type
std::vector< utils::Optional< Variant > > value_type
Definition: variant_parallel_input_iterator.hpp:214
genesis::population::VariantParallelInputIterator::Iterator::operator*
std::vector< utils::Optional< Variant > > & operator*()
Definition: variant_parallel_input_iterator.hpp:255
optional.hpp