A library for working with phylogenetic and population genetic data.
v0.32.0
chromosome_window_stream.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_WINDOW_CHROMOSOME_WINDOW_STREAM_H_
2 #define GENESIS_POPULATION_WINDOW_CHROMOSOME_WINDOW_STREAM_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
37 
38 #include <cassert>
39 #include <functional>
40 #include <memory>
41 #include <stdexcept>
42 #include <string>
43 #include <type_traits>
44 #include <unordered_set>
45 #include <utility>
46 
47 namespace genesis {
48 namespace population {
49 
50 // =================================================================================================
51 // Chromosome Window Stream
52 // =================================================================================================
53 
89 template<class InputStreamIterator, class DataType = typename InputStreamIterator::value_type>
91  InputStreamIterator, DataType, ::genesis::population::WindowView<DataType>
92 >
93 {
94 public:
95 
96  // -------------------------------------------------------------------------
97  // Typedefs and Enums
98  // -------------------------------------------------------------------------
99 
103 
104  // The input types that we take from the underlying stream over genome positions.
105  using InputType = typename InputStreamIterator::value_type;
106  // using Entry = typename Window::Entry;
107 
108  // This class produces an iterator of type WindowView.
109  // That WindowView then iterates over the actual values of the input.
110  using iterator_category = std::input_iterator_tag;
112  using pointer = value_type*;
114  using const_reference = value_type const&;
115 
116  // ======================================================================================
117  // Internal Iterator
118  // ======================================================================================
119 
123  class DerivedIterator final : public BaseWindowStream<
124  InputStreamIterator, DataType, WindowViewType
125  >::BaseIterator
126  {
127  public:
128 
129  // -------------------------------------------------------------------------
130  // Constructors and Rule of Five
131  // -------------------------------------------------------------------------
132 
133  using self_type = typename ChromosomeWindowStream<
134  InputStreamIterator, DataType
136 
137  // using base_iterator_type = typename base_type::BaseIterator;
138  using base_iterator_type = typename BaseWindowStream<
139  InputStreamIterator, DataType, WindowViewType
141 
142  // using WindowViewType = WindowViewType;
143  // using Window = ::genesis::population::Window<DataType>;
144  // using Entry = typename Window::Entry;
145  using InputType = typename InputStreamIterator::value_type;
146 
147  using iterator_category = std::input_iterator_tag;
149  using pointer = value_type*;
151  using const_reference = value_type const&;
152 
153  private:
154 
155  DerivedIterator() = default;
156 
158  ChromosomeWindowStream const* parent
159  )
160  : base_iterator_type( parent )
161  , parent_( parent )
162  // , window_( base_iterator_type::current_, base_iterator_type::end_ )
163  // , window_( parent )
164  {
165  // Edge case check. See Base for details.
166  if( ! parent_ ) {
167  return;
168  }
169 
170  // For this particular iterator, where we process the whole chromosome or genome,
171  // we are always at the "first" and "last" window of a chromosome, in a sense...
172  base_iterator_type::is_first_window_ = true;
173  base_iterator_type::is_last_window_ = true;
174 
175  // Let's get going.
176  increment_();
177  }
178 
179  public:
180 
181  virtual ~DerivedIterator() override = default;
182 
183  DerivedIterator( self_type const& ) = default;
184  DerivedIterator( self_type&& ) = default;
185 
186  DerivedIterator& operator= ( self_type const& ) = default;
187  DerivedIterator& operator= ( self_type&& ) = default;
188 
190 
191  // -------------------------------------------------------------------------
192  // Internal and Virtual Members
193  // -------------------------------------------------------------------------
194 
195  private:
196 
197  void increment_() override final
198  {
199  // Check that we are still good. If not, this function being called is likely a user
200  // error by trying to increment a past-the-end iterator.
201  assert( parent_ );
202 
203  // Move to the next chromosome. This is only important if this increment function
204  // is called before the inner window view iterator has finished the whole chromosome,
205  // so if for example a break is called within.
206  while(
207  base_iterator_type::current_ != base_iterator_type::end_ &&
208  parent_->chromosome_function( *base_iterator_type::current_ ) == window_.chromosome()
209  ) {
210  ++base_iterator_type::current_;
211  }
212 
213  // Now check whether there is any data left. If not, we are done here.
214  if( base_iterator_type::current_ == base_iterator_type::end_ ) {
215  parent_ = nullptr;
216  return;
217  }
218  assert( parent_ );
219 
220  // Now we know there is still data, but it belongs to a different chromosome.
221  assert( base_iterator_type::current_ != base_iterator_type::end_ );
222  assert(
223  parent_->chromosome_function( *base_iterator_type::current_ ) != window_.chromosome()
224  );
225 
226  // We need pointer variables to the iterators and other elements,
227  // which can be used as copy-inits for the lambda below.
228  // We need to access the underlying iterator through the self type of the class,
229  // see here https://stackoverflow.com/a/28623297/4184258
230  // (we could do this in all other instances as well, but it only matters here).
231  bool is_first = true;
232  auto& cur = self_type::current_;
233  auto& end = self_type::end_;
234  auto const par = parent_;
235  auto const chr = parent_->chromosome_function( *base_iterator_type::current_ );
236  auto const seq_dict = parent_->sequence_dict_;
237 
238  // Check that we do not have invalid data where chromosomes are repeated.
239  if( processed_chromosomes_.count( chr ) > 0 ) {
240  throw std::runtime_error(
241  "Chromosome " + chr + " occurs multiple times in the input."
242  );
243  }
244  processed_chromosomes_.insert( chr );
245 
246  // We reset the window view, so that it's a new iterator for the new chromosome.
247  window_ = WindowViewType();
248  window_.chromosome( chr );
249  if( parent_->sequence_dict_ ) {
250  auto const dict_entry = parent_->sequence_dict_->find( chr );
251  if( dict_entry == parent_->sequence_dict_->end() ) {
252  throw std::invalid_argument(
253  "In ChromosomeWindowStream: Cannot iterate chromosome \"" + chr +
254  "\", as the provided sequence dictionary or reference genome "
255  "does not contain the chromosome."
256  );
257  }
258  window_.first_position( 1 );
259  window_.last_position( dict_entry->length );
260  } else {
261  window_.first_position( 1 );
262  window_.last_position( 1 );
263  }
264  auto& window = window_;
265 
266  // Iterate starting from the first position, with a fitting increment function.
267  window_.get_element = [
268  is_first, &cur, &end, par, chr, seq_dict, &window
269  ]() mutable -> DataType* {
270  // If this is the first call of the function, we are initializing the WindowView
271  // with the current entry of the underlying iterator. If not, we first move to the
272  // next position (if there is any), before getting the data.
273  if( is_first ) {
274  assert( cur != end );
275  is_first = false;
276  return &*cur;
277  }
278 
279  // Now we are in the case that we want to move to the next position first.
280  // Move to the next position.
281  assert( cur != end );
282  auto const old_pos = par->position_function( *cur );
283  ++cur;
284 
285  // Check whether we are done with the chromosome.
286  // If not, we update the last position to be the one that we just found,
287  // and return the current element that we just moved to.
288  if( cur == end || par->chromosome_function( *cur ) != chr ) {
289 
290  // If we reach the end of a chromosome, we check that its length is within
291  // the dict limits, just as a safety measure. Only check once, to avoid
292  // looking up the chromosome in the dict in every iteration.
293  if( seq_dict && old_pos > seq_dict->get( chr ).length ) {
294  throw std::invalid_argument(
295  "In ChromosomeWindowStream: Chromosome \"" + chr + "\" has length " +
296  std::to_string( seq_dict->get( chr ).length ) +
297  " in the provided sequence dictionary or reference genome, "
298  "but the input data contains positions up to " +
299  std::to_string( old_pos ) + " for that chromosome."
300  );
301  }
302 
303  // If we have a ref genome, we use that for the window positions.
304  // If not (here), we use the last position we found in the input.
305  if( ! seq_dict ) {
306  window.last_position( old_pos );
307  }
308 
309  // We are done with the chromosome (or whole input), and signal this via nullptr.
310  return nullptr;
311  }
312  assert( cur != end );
313  assert( par->chromosome_function( *cur ) == chr );
314 
315  // Check that it is in the correct order.
316  auto const new_pos = par->position_function( *cur );
317  if( old_pos >= new_pos ) {
318  throw std::runtime_error(
319  "Invalid order on chromosome " + chr + " with position " +
320  std::to_string( old_pos ) + " followed by position " +
321  std::to_string( new_pos )
322  );
323  }
324 
325  // Return a pointer to the element.
326  return &*cur;
327  };
328  }
329 
330  value_type& get_current_window_() const override final
331  {
332  return const_cast<value_type&>( window_ );
333  }
334 
335  base_type const* get_parent_() const override final
336  {
337  return parent_;
338  }
339 
340  private:
341 
342  // Parent. Needs to live here to have the correct derived type.
343  ChromosomeWindowStream const* parent_ = nullptr;
344 
345  // Store the iterator for the window.
346  WindowViewType window_;
347 
348  // We keep track of which chromosomes we have seen yet, in order to allow random order,
349  // but not repeated chromosomes.
350  std::unordered_set<std::string> processed_chromosomes_;
351 
352  };
353 
354  // ======================================================================================
355  // Main Class
356  // ======================================================================================
357 
358  // -------------------------------------------------------------------------
359  // Constructors and Rule of Five
360  // -------------------------------------------------------------------------
361 
363  InputStreamIterator begin, InputStreamIterator end
364  )
365  : base_type( begin, end )
366  {}
367 
368  virtual ~ChromosomeWindowStream() override = default;
369 
372 
375 
377 
378  // -------------------------------------------------------------------------
379  // Settings
380  // -------------------------------------------------------------------------
381 
385  std::shared_ptr<genesis::sequence::SequenceDict> sequence_dict() const
386  {
387  return sequence_dict_;
388  }
389 
401  self_type& sequence_dict( std::shared_ptr<genesis::sequence::SequenceDict> value )
402  {
403  sequence_dict_ = value;
404  return *this;
405  }
406 
407  // -------------------------------------------------------------------------
408  // Virtual Members
409  // -------------------------------------------------------------------------
410 
411 protected:
412 
413  std::unique_ptr<typename base_type::BaseIterator>
414  get_begin_iterator_() override final
415  {
416  // Cannot use make_unique here, as the Iterator constructor is private,
417  // and trying to make make_unique a friend does not seem to be working...
418  return std::unique_ptr<DerivedIterator>( new DerivedIterator( this ));
419  // return utils::make_unique<DerivedIterator>( this );
420  }
421 
422  std::unique_ptr<typename base_type::BaseIterator>
423  get_end_iterator_() override final
424  {
425  return std::unique_ptr<DerivedIterator>( new DerivedIterator( nullptr ));
426  // return utils::make_unique<DerivedIterator>( nullptr );
427  }
428 
429  // -------------------------------------------------------------------------
430  // Data Members
431  // -------------------------------------------------------------------------
432 
433 private:
434 
435  // When iterating chromosomes, we might want to look up their lengths,
436  // in order to properly set the window start and end. Otherwise we use what's in the data.
437  std::shared_ptr<genesis::sequence::SequenceDict> sequence_dict_;
438 
439 };
440 
441 // =================================================================================================
442 // Make Chromosome Window View Iterator
443 // =================================================================================================
444 
449 template<class InputStreamIterator, class DataType = typename InputStreamIterator::value_type>
450 ChromosomeWindowStream<InputStreamIterator, DataType>
452  InputStreamIterator begin, InputStreamIterator end
453 ) {
455 }
456 
468 template<class InputStreamIterator>
469 ChromosomeWindowStream<InputStreamIterator>
471  InputStreamIterator begin, InputStreamIterator end
472 ) {
473  using DataType = typename InputStreamIterator::value_type;
474 
475  // Set functors.
476  auto it = ChromosomeWindowStream<InputStreamIterator>( begin, end );
477  it.entry_input_function = []( DataType const& variant ) {
478  return variant;
479  };
480  it.chromosome_function = []( DataType const& variant ) {
481  return variant.chromosome;
482  };
483  it.position_function = []( DataType const& variant ) {
484  return variant.position;
485  };
486 
487  // Set properties and return.
488  return it;
489 }
490 
491 } // namespace population
492 } // namespace genesis
493 
494 #endif // include guard
genesis::population::BaseWindowStream
Base class for streams of Windows over the chromosomes of a genome.
Definition: base_window_stream.hpp:119
genesis::population::BaseWindowStream::BaseIterator::iterator_category
std::input_iterator_tag iterator_category
Definition: base_window_stream.hpp:491
base_window_stream.hpp
genesis::population::BaseWindowStream::chromosome_function
std::function< std::string(InputType const &)> chromosome_function
Functor that yields the current chromosome, given the input stream data.
Definition: base_window_stream.hpp:152
genesis::population::ChromosomeWindowStream::sequence_dict
self_type & sequence_dict(std::shared_ptr< genesis::sequence::SequenceDict > value)
Set a sequence dictionary to be used for the chromosome lengths.
Definition: chromosome_window_stream.hpp:401
genesis::population::ChromosomeWindowStream::get_end_iterator_
std::unique_ptr< typename base_type::BaseIterator > get_end_iterator_() override final
Get the end iterator.
Definition: chromosome_window_stream.hpp:423
genesis::population::BaseWindowStream< InputStreamIterator, typename InputStreamIterator::value_type, ::genesis::population::WindowView< typename InputStreamIterator::value_type > >::end
Iterator end()
Definition: base_window_stream.hpp:759
genesis::population::ChromosomeWindowStream::sequence_dict
std::shared_ptr< genesis::sequence::SequenceDict > sequence_dict() const
Get the currently set sequence dictionary used for the chromosome lengths.
Definition: chromosome_window_stream.hpp:385
genesis::population::ChromosomeWindowStream::DerivedIterator
friend DerivedIterator
Definition: chromosome_window_stream.hpp:376
window_view.hpp
genesis::population::ChromosomeWindowStream::ChromosomeWindowStream
ChromosomeWindowStream(InputStreamIterator begin, InputStreamIterator end)
Definition: chromosome_window_stream.hpp:362
genesis::population::BaseWindowStream::BaseIterator::value_type
WindowType value_type
Definition: base_window_stream.hpp:492
genesis::population::ChromosomeWindowStream::DerivedIterator::operator=
DerivedIterator & operator=(self_type const &)=default
genesis::population::BaseWindowStream::BaseIterator::const_reference
value_type const & const_reference
Definition: base_window_stream.hpp:495
genesis::population::ChromosomeWindowStream::get_begin_iterator_
std::unique_ptr< typename base_type::BaseIterator > get_begin_iterator_() override final
Get the begin iterator.
Definition: chromosome_window_stream.hpp:414
genesis::population::BaseWindowStream< InputStreamIterator, typename InputStreamIterator::value_type, ::genesis::population::WindowView< typename InputStreamIterator::value_type > >::DataType
typename InputStreamIterator::value_type DataType
Definition: base_window_stream.hpp:128
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::population::ChromosomeWindowStream::iterator_category
std::input_iterator_tag iterator_category
Definition: chromosome_window_stream.hpp:110
genesis::population::ChromosomeWindowStream::DerivedIterator::~DerivedIterator
virtual ~DerivedIterator() override=default
genesis::population::ChromosomeWindowStream::DerivedIterator::base_iterator_type
typename BaseWindowStream< InputStreamIterator, DataType, WindowViewType >::BaseIterator base_iterator_type
Definition: chromosome_window_stream.hpp:140
genesis::population::BaseWindow::chromosome
std::string const & chromosome() const
Get the chromosome name that this Window belongs to.
Definition: base_window.hpp:90
genesis::population::ChromosomeWindowStream::base_type
BaseWindowStream< InputStreamIterator, DataType, WindowViewType > base_type
Definition: chromosome_window_stream.hpp:102
sequence_dict.hpp
genesis::population::ChromosomeWindowStream::InputType
typename InputStreamIterator::value_type InputType
Definition: chromosome_window_stream.hpp:105
genesis::population::BaseWindowStream::BaseIterator::BaseIterator
BaseIterator()=default
genesis::population::BaseWindowStream::BaseIterator::self_type
typename BaseWindowStream< InputStreamType, DataType, WindowType >::BaseIterator self_type
Definition: base_window_stream.hpp:488
genesis::population::WindowView< DataType >
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::BaseWindowStream< InputStreamIterator, typename InputStreamIterator::value_type, ::genesis::population::WindowView< typename InputStreamIterator::value_type > >::begin
Iterator begin()
Definition: base_window_stream.hpp:747
genesis::population::ChromosomeWindowStream::const_reference
value_type const & const_reference
Definition: chromosome_window_stream.hpp:114
genesis::population::ChromosomeWindowStream::DerivedIterator::ChromosomeWindowStream
friend ChromosomeWindowStream
Definition: chromosome_window_stream.hpp:189
genesis::population::ChromosomeWindowStream::WindowViewType
::genesis::population::WindowView< DataType > WindowViewType
Definition: chromosome_window_stream.hpp:100
genesis::population::BaseWindowStream::BaseIterator::pointer
value_type * pointer
Definition: base_window_stream.hpp:493
genesis::population::ChromosomeWindowStream::operator=
ChromosomeWindowStream & operator=(ChromosomeWindowStream const &)=default
genesis::population::BaseWindow::first_position
size_t first_position() const
Get the first position in the chromosome of the Window, that is, where the Window starts.
Definition: base_window.hpp:114
genesis::population::make_chromosome_window_stream
ChromosomeWindowStream< InputStreamIterator, DataType > make_chromosome_window_stream(InputStreamIterator begin, InputStreamIterator end)
Helper function to instantiate a ChromosomeWindowStream for each chromosome, without the need to spec...
Definition: chromosome_window_stream.hpp:451
genesis::population::WindowView::get_element
std::function< Data *()> get_element
Function to read the next element from some input source.
Definition: window_view.hpp:330
genesis::population::ChromosomeWindowStream::~ChromosomeWindowStream
virtual ~ChromosomeWindowStream() override=default
genesis::population::BaseWindow::last_position
size_t last_position() const
Get the last position in the chromosome of the Window, that is, where the Window ends.
Definition: base_window.hpp:134
genesis::population::BaseWindowStream::BaseIterator::reference
value_type & reference
Definition: base_window_stream.hpp:494
genesis::population::make_default_chromosome_window_stream
ChromosomeWindowStream< InputStreamIterator > make_default_chromosome_window_stream(InputStreamIterator begin, InputStreamIterator end)
Helper function to instantiate a ChromosomeWindowStream for each chromosome, for a default use case.
Definition: chromosome_window_stream.hpp:470
genesis::population::ChromosomeWindowStream::DerivedIterator
Internal iterator that produces WindowViews.
Definition: chromosome_window_stream.hpp:123
genesis::population::ChromosomeWindowStream
Stream for traversing each chromosome as a whole, with an inner WindowView iterator over the position...
Definition: chromosome_window_stream.hpp:90
genesis::population::BaseWindowStream::BaseIterator::InputType
typename InputStreamType::value_type InputType
Definition: base_window_stream.hpp:489