A library for working with phylogenetic and population genetic data.
v0.32.0
genome_window_stream.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_WINDOW_GENOME_WINDOW_STREAM_H_
2 #define GENESIS_POPULATION_WINDOW_GENOME_WINDOW_STREAM_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
36 
37 #include <cassert>
38 #include <functional>
39 #include <memory>
40 #include <stdexcept>
41 #include <string>
42 #include <type_traits>
43 #include <unordered_set>
44 #include <utility>
45 
46 namespace genesis {
47 namespace population {
48 
49 // =================================================================================================
50 // Genome Window Stream
51 // =================================================================================================
52 
86 template<class InputStreamIterator, class DataType = typename InputStreamIterator::value_type>
87 class GenomeWindowStream final : public BaseWindowStream<
88  InputStreamIterator, DataType, ::genesis::population::WindowView<DataType>
89 >
90 {
91 public:
92 
93  // -------------------------------------------------------------------------
94  // Typedefs and Enums
95  // -------------------------------------------------------------------------
96 
100 
101  // The input types that we take from the underlying stream over genome positions.
102  using InputType = typename InputStreamIterator::value_type;
103  // using Entry = typename Window::Entry;
104 
105  // This class produces an iterator of type WindowView.
106  // That WindowView then iterates over the actual values of the input.
107  using iterator_category = std::input_iterator_tag;
109  using pointer = value_type*;
111  using const_reference = value_type const&;
112 
113  // ======================================================================================
114  // Internal Iterator
115  // ======================================================================================
116 
120  class DerivedIterator final : public BaseWindowStream<
121  InputStreamIterator, DataType, WindowViewType
122  >::BaseIterator
123  {
124  public:
125 
126  // -------------------------------------------------------------------------
127  // Constructors and Rule of Five
128  // -------------------------------------------------------------------------
129 
130  using self_type = typename GenomeWindowStream<
131  InputStreamIterator, DataType
133 
134  // using base_iterator_type = typename base_type::BaseIterator;
135  using base_iterator_type = typename BaseWindowStream<
136  InputStreamIterator, DataType, WindowViewType
138 
139  // using WindowViewType = WindowViewType;
140  // using Window = ::genesis::population::Window<DataType>;
141  // using Entry = typename Window::Entry;
142  using InputType = typename InputStreamIterator::value_type;
143 
144  using iterator_category = std::input_iterator_tag;
146  using pointer = value_type*;
148  using const_reference = value_type const&;
149 
150  private:
151 
152  DerivedIterator() = default;
153 
155  GenomeWindowStream const* parent
156  )
157  : base_iterator_type( parent )
158  , parent_( parent )
159  // , window_( base_iterator_type::current_, base_iterator_type::end_ )
160  // , window_( parent )
161  {
162  // Edge case check. See Base for details.
163  if( ! parent_ ) {
164  return;
165  }
166 
167  // For this particular iterator, where we process the whole genome,
168  // we are always at the "first" and "last" window of a chromosome, in a sense...
169  base_iterator_type::is_first_window_ = true;
170  base_iterator_type::is_last_window_ = true;
171 
172  // Let's get going. For the whole genome case, we only need to do the init once,
173  // and then are done, as the iterator will do the whole thing in one pass, so there
174  // never is a second iteration, and hence, increment is never called.
175  init_whole_genome_();
176  }
177 
178  public:
179 
180  virtual ~DerivedIterator() override = default;
181 
182  DerivedIterator( self_type const& ) = default;
183  DerivedIterator( self_type&& ) = default;
184 
185  DerivedIterator& operator= ( self_type const& ) = default;
186  DerivedIterator& operator= ( self_type&& ) = default;
187 
189 
190  // -------------------------------------------------------------------------
191  // Internal and Virtual Members
192  // -------------------------------------------------------------------------
193 
194  private:
195 
196  void increment_() override final
197  {
198  // Check that we are still good. If not, this function being called is likely a user
199  // error by trying to increment a past-the-end iterator.
200  assert( parent_ );
201 
202  // For whole chromosome, we always reach the end after incrementing,
203  // and don't need to do anything, except for singalling that end.
204  parent_ = nullptr;
205  }
206 
207  void init_whole_genome_()
208  {
209  assert( parent_ );
210 
211  // Need to check whether there is any data at all. If not, we are done here.
212  if( base_iterator_type::current_ == base_iterator_type::end_ ) {
213  parent_ = nullptr;
214  return;
215  }
216 
217  // We need pointer variables to the iterators and other elements,
218  // so that we can capture them in a C++11 lambda... bit cumbersome.
219  // We need to access the underlying iterator through the self type of the class,
220  // see here https://stackoverflow.com/a/28623297/4184258
221  // (we could do this in all other instances as well, but it only matters here).
222  bool is_first = true;
223  auto& cur = self_type::current_;
224  auto& end = self_type::end_;
225  auto const par = parent_;
226  auto chr = parent_->chromosome_function( *base_iterator_type::current_ );
227  auto const seq_dict = parent_->sequence_dict_;
228 
229  // We set the genome window view. We leave the normal properties for chromosome,
230  // and start and end position of the view untouched here at their defaults,
231  // as this special case instead uses the mechanism of WindowView direclty to report
232  // the chromosomes and their lengths as they are encountered here in the stream.
233  // This is becase we do not have a window over a single chromosome here, and hence
234  // need this special case. See WindowView.
235  window_ = WindowViewType();
236  window_.is_whole_genome( true );
237  auto& window = window_;
238 
239  window_.get_element = [
240  is_first, &cur, &end, par, chr, seq_dict, &window
241  ]() mutable -> DataType* {
242  assert( cur != end );
243 
244  // If this is the first call of the function, we are initializing the WindowView
245  // with the current entry of the underlying iterator. If not, we first move to the
246  // next position (if there is any), before getting the data.
247  if( is_first ) {
248  is_first = false;
249  return &*cur;
250  }
251 
252  // Now we are in the case that we want to move to the next position first.
253  auto const old_pos = par->position_function( *cur );
254  ++cur;
255 
256  // Check whether we are done with the chromosome. That's when we want to update
257  // the window chromosome lengths, and check everything related to that.
258  if( cur == end || par->chromosome_function( *cur ) != chr ) {
259 
260  // We now are finished with a chromsome, so we can add its length to the window.
261  // First we get the length, either from the data or from the dict, if given.
262  size_t chr_len = 0;
263  if( seq_dict ) {
264  auto const dict_entry = seq_dict->find( chr );
265  if( dict_entry == seq_dict->end() ) {
266  throw std::invalid_argument(
267  "In GenomeWindowStream: Cannot iterate chromosome \"" + chr +
268  "\", as the provided sequence dictionary or reference genome "
269  "does not contain the chromosome."
270  );
271  }
272  chr_len = dict_entry->length;
273 
274  if( old_pos > chr_len ) {
275  throw std::invalid_argument(
276  "In GenomeWindowStream: Chromosome \"" + chr + "\" has length " +
277  std::to_string( chr_len ) +
278  " in the provided sequence dictionary or reference genome, "
279  "but the input data contains positions up to " +
280  std::to_string( old_pos ) + " for that chromosome."
281  );
282  }
283  } else {
284  chr_len = old_pos;
285  }
286 
287  // Add the chr and its length to the window.
288  if( window.chromosomes().count( chr ) > 0 ) {
289  throw std::runtime_error(
290  "Chromosome " + chr + " occurs multiple times in the input."
291  );
292  }
293  window.chromosomes()[ chr ] = chr_len;
294 
295  // Now check again whether we are done with the data.
296  // If so, nothing else to do here.
297  if( cur == end ) {
298  return nullptr;
299  }
300 
301  // Here, we are not yet at the end of the data, but at a new chromosome.
302  assert( par->chromosome_function( *cur ) != chr );
303  chr = par->chromosome_function( *cur );
304 
305  return &*cur;
306  }
307  assert( cur != end );
308  assert( par->chromosome_function( *cur ) == chr );
309 
310  // Check that it is in the correct order.
311  auto const new_pos = par->position_function( *cur );
312  if( old_pos >= new_pos ) {
313  throw std::runtime_error(
314  "Invalid order on chromosome " + chr + " with position " +
315  std::to_string( old_pos ) + " followed by position " +
316  std::to_string( new_pos )
317  );
318  }
319 
320  return &*cur;
321  };
322  }
323 
324  value_type& get_current_window_() const override final
325  {
326  return const_cast<value_type&>( window_ );
327  }
328 
329  base_type const* get_parent_() const override final
330  {
331  return parent_;
332  }
333 
334  private:
335 
336  // Parent. Needs to live here to have the correct derived type.
337  GenomeWindowStream const* parent_ = nullptr;
338 
339  // Store the iterator for the window.
340  WindowViewType window_;
341 
342  };
343 
344  // ======================================================================================
345  // Main Class
346  // ======================================================================================
347 
348  // -------------------------------------------------------------------------
349  // Constructors and Rule of Five
350  // -------------------------------------------------------------------------
351 
353  InputStreamIterator begin, InputStreamIterator end
354  )
355  : base_type( begin, end )
356  {}
357 
358  virtual ~GenomeWindowStream() override = default;
359 
360  GenomeWindowStream( GenomeWindowStream const& ) = default;
361  GenomeWindowStream( GenomeWindowStream&& ) = default;
362 
363  GenomeWindowStream& operator= ( GenomeWindowStream const& ) = default;
365 
367 
368  // -------------------------------------------------------------------------
369  // Settings
370  // -------------------------------------------------------------------------
371 
375  std::shared_ptr<genesis::sequence::SequenceDict> sequence_dict() const
376  {
377  return sequence_dict_;
378  }
379 
390  self_type& sequence_dict( std::shared_ptr<genesis::sequence::SequenceDict> value )
391  {
392  sequence_dict_ = value;
393  return *this;
394  }
395 
396  // -------------------------------------------------------------------------
397  // Virtual Members
398  // -------------------------------------------------------------------------
399 
400 protected:
401 
402  std::unique_ptr<typename base_type::BaseIterator>
403  get_begin_iterator_() override final
404  {
405  // Cannot use make_unique here, as the Iterator constructor is private,
406  // and trying to make make_unique a friend does not seem to be working...
407  return std::unique_ptr<DerivedIterator>( new DerivedIterator( this ));
408  // return utils::make_unique<DerivedIterator>( this );
409  }
410 
411  std::unique_ptr<typename base_type::BaseIterator>
412  get_end_iterator_() override final
413  {
414  return std::unique_ptr<DerivedIterator>( new DerivedIterator( nullptr ));
415  // return utils::make_unique<DerivedIterator>( nullptr );
416  }
417 
418  // -------------------------------------------------------------------------
419  // Data Members
420  // -------------------------------------------------------------------------
421 
422 private:
423 
424  // When iterating the genome, we might want to look up their lengths,
425  // in order to properly set the window start and end. Otherwise we use what's in the data.
426  std::shared_ptr<genesis::sequence::SequenceDict> sequence_dict_;
427 
428 };
429 
430 // =================================================================================================
431 // Make Genome Window View Iterator
432 // =================================================================================================
433 
441 template<class InputStreamIterator, class DataType = typename InputStreamIterator::value_type>
442 GenomeWindowStream<InputStreamIterator, DataType>
444  InputStreamIterator begin, InputStreamIterator end
445 ) {
447 }
448 
463 template<class InputStreamIterator>
464 GenomeWindowStream<InputStreamIterator>
466  InputStreamIterator begin, InputStreamIterator end
467 ) {
468  using DataType = typename InputStreamIterator::value_type;
469 
470  // Set functors.
471  auto it = GenomeWindowStream<InputStreamIterator>( begin, end );
472  it.entry_input_function = []( DataType const& variant ) {
473  return variant;
474  };
475  it.chromosome_function = []( DataType const& variant ) {
476  return variant.chromosome;
477  };
478  it.position_function = []( DataType const& variant ) {
479  return variant.position;
480  };
481 
482  // Set properties and return.
483  return it;
484 }
485 
486 } // namespace population
487 } // namespace genesis
488 
489 #endif // include guard
genesis::population::GenomeWindowStream::DerivedIterator::GenomeWindowStream
friend GenomeWindowStream
Definition: genome_window_stream.hpp:188
genesis::population::GenomeWindowStream::GenomeWindowStream
GenomeWindowStream(InputStreamIterator begin, InputStreamIterator end)
Definition: genome_window_stream.hpp:352
genesis::population::BaseWindowStream
Base class for streams of Windows over the chromosomes of a genome.
Definition: base_window_stream.hpp:119
genesis::population::BaseWindowStream::BaseIterator::iterator_category
std::input_iterator_tag iterator_category
Definition: base_window_stream.hpp:491
base_window_stream.hpp
genesis::population::GenomeWindowStream::iterator_category
std::input_iterator_tag iterator_category
Definition: genome_window_stream.hpp:107
genesis::population::BaseWindowStream::chromosome_function
std::function< std::string(InputType const &)> chromosome_function
Functor that yields the current chromosome, given the input stream data.
Definition: base_window_stream.hpp:152
genesis::population::GenomeWindowStream::DerivedIterator::value_type
WindowViewType value_type
Definition: genome_window_stream.hpp:145
genesis::population::GenomeWindowStream::DerivedIterator
Internal iterator that produces WindowViews.
Definition: genome_window_stream.hpp:120
genesis::population::BaseWindowStream< InputStreamIterator, typename InputStreamIterator::value_type, ::genesis::population::WindowView< typename InputStreamIterator::value_type > >::end
Iterator end()
Definition: base_window_stream.hpp:759
genesis::population::GenomeWindowStream::WindowViewType
::genesis::population::WindowView< DataType > WindowViewType
Definition: genome_window_stream.hpp:97
window_view.hpp
genesis::population::make_default_genome_window_stream
GenomeWindowStream< InputStreamIterator > make_default_genome_window_stream(InputStreamIterator begin, InputStreamIterator end)
Helper function to instantiate a GenomeWindowStream for the whole genome, for a default use case.
Definition: genome_window_stream.hpp:465
genesis::population::BaseWindowStream::BaseIterator::value_type
WindowType value_type
Definition: base_window_stream.hpp:492
genesis::population::GenomeWindowStream::get_end_iterator_
std::unique_ptr< typename base_type::BaseIterator > get_end_iterator_() override final
Get the end iterator.
Definition: genome_window_stream.hpp:412
genesis::population::BaseWindowStream::BaseIterator::const_reference
value_type const & const_reference
Definition: base_window_stream.hpp:495
genesis::population::BaseWindowStream< InputStreamIterator, typename InputStreamIterator::value_type, ::genesis::population::WindowView< typename InputStreamIterator::value_type > >::DataType
typename InputStreamIterator::value_type DataType
Definition: base_window_stream.hpp:128
genesis::population::GenomeWindowStream::sequence_dict
std::shared_ptr< genesis::sequence::SequenceDict > sequence_dict() const
Get the currently set sequence dictionary used for the chromosome lengths.
Definition: genome_window_stream.hpp:375
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::population::GenomeWindowStream::get_begin_iterator_
std::unique_ptr< typename base_type::BaseIterator > get_begin_iterator_() override final
Get the begin iterator.
Definition: genome_window_stream.hpp:403
genesis::population::GenomeWindowStream::~GenomeWindowStream
virtual ~GenomeWindowStream() override=default
genesis::population::GenomeWindowStream::base_type
BaseWindowStream< InputStreamIterator, DataType, WindowViewType > base_type
Definition: genome_window_stream.hpp:99
genesis::population::GenomeWindowStream::DerivedIterator::base_iterator_type
typename BaseWindowStream< InputStreamIterator, DataType, WindowViewType >::BaseIterator base_iterator_type
Definition: genome_window_stream.hpp:137
genesis::population::GenomeWindowStream::DerivedIterator::~DerivedIterator
virtual ~DerivedIterator() override=default
genesis::population::BaseWindow::is_whole_genome
bool is_whole_genome() const
Return if this instance is intended to be used for a whole genome stream.
Definition: base_window.hpp:192
genesis::population::BaseWindowStream::BaseIterator::BaseIterator
BaseIterator()=default
genesis::population::BaseWindowStream::BaseIterator::self_type
typename BaseWindowStream< InputStreamType, DataType, WindowType >::BaseIterator self_type
Definition: base_window_stream.hpp:488
genesis::population::GenomeWindowStream::const_reference
value_type const & const_reference
Definition: genome_window_stream.hpp:111
genesis::population::WindowView< DataType >
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::GenomeWindowStream
Stream for traversing the entire genome as a single window, with an inner WindowView iterator over th...
Definition: genome_window_stream.hpp:87
genesis::population::BaseWindowStream< InputStreamIterator, typename InputStreamIterator::value_type, ::genesis::population::WindowView< typename InputStreamIterator::value_type > >::begin
Iterator begin()
Definition: base_window_stream.hpp:747
genesis::population::GenomeWindowStream::DerivedIterator
friend DerivedIterator
Definition: genome_window_stream.hpp:366
genesis::population::BaseWindowStream::BaseIterator::pointer
value_type * pointer
Definition: base_window_stream.hpp:493
genesis::population::make_genome_window_stream
GenomeWindowStream< InputStreamIterator, DataType > make_genome_window_stream(InputStreamIterator begin, InputStreamIterator end)
Helper function to instantiate a GenomeWindowStream for the whole genome, without the need to specify...
Definition: genome_window_stream.hpp:443
genesis::population::GenomeWindowStream::operator=
GenomeWindowStream & operator=(GenomeWindowStream const &)=default
genesis::population::WindowView::get_element
std::function< Data *()> get_element
Function to read the next element from some input source.
Definition: window_view.hpp:330
genesis::population::GenomeWindowStream::InputType
typename InputStreamIterator::value_type InputType
Definition: genome_window_stream.hpp:102
genesis::population::BaseWindowStream::BaseIterator::reference
value_type & reference
Definition: base_window_stream.hpp:494
genesis::population::BaseWindowStream::BaseIterator::InputType
typename InputStreamType::value_type InputType
Definition: base_window_stream.hpp:489
genesis::population::GenomeWindowStream::sequence_dict
self_type & sequence_dict(std::shared_ptr< genesis::sequence::SequenceDict > value)
Set a sequence dictionary to be used for the chromosome lengths.
Definition: genome_window_stream.hpp:390
genesis::population::GenomeWindowStream::DerivedIterator::operator=
DerivedIterator & operator=(self_type const &)=default