A library for working with phylogenetic and population genetic data.
v0.32.0
region_window_stream.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_WINDOW_REGION_WINDOW_STREAM_H_
2 #define GENESIS_POPULATION_WINDOW_REGION_WINDOW_STREAM_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
42 
43 #include <cassert>
44 #include <deque>
45 #include <functional>
46 #include <memory>
47 #include <stdexcept>
48 #include <string>
49 #include <type_traits>
50 #include <unordered_set>
51 #include <utility>
52 #include <vector>
53 
54 namespace genesis {
55 namespace population {
56 
57 // =================================================================================================
58 // Region Window Stream
59 // =================================================================================================
60 
88 template<class InputStreamIterator, class DataType = typename InputStreamIterator::value_type>
89 class RegionWindowStream final : public BaseWindowStream<InputStreamIterator, DataType>
90 {
91 public:
92 
93  // -------------------------------------------------------------------------
94  // Typedefs and Enums
95  // -------------------------------------------------------------------------
96 
99 
101  using Entry = typename Window::Entry;
102  using InputType = typename InputStreamIterator::value_type;
103 
104  using iterator_category = std::input_iterator_tag;
106  using pointer = value_type*;
108  using const_reference = value_type const&;
109 
110  // ======================================================================================
111  // Internal Iterator
112  // ======================================================================================
113 
117  class DerivedIterator final : public BaseWindowStream<InputStreamIterator, DataType>::BaseIterator
118  {
119  public:
120 
121  // -------------------------------------------------------------------------
122  // Constructors and Rule of Five
123  // -------------------------------------------------------------------------
124 
125  using self_type = typename RegionWindowStream<
126  InputStreamIterator, DataType
128 
129  using base_iterator_type = typename BaseWindowStream<
130  InputStreamIterator, DataType
132 
134  using Entry = typename Window::Entry;
135  using InputType = typename InputStreamIterator::value_type;
136 
137  using iterator_category = std::input_iterator_tag;
139  using pointer = value_type*;
141  using const_reference = value_type const&;
142 
143  private:
144 
145  DerivedIterator() = default;
146 
148  RegionWindowStream const* parent
149  )
150  : base_iterator_type( parent )
151  , parent_( parent )
152  {
153  // Edge case check. See Base for details.
154  if( ! parent_ ) {
155  return;
156  }
157  assert( parent_ );
158 
159  // Saveguard. This might be called on an empty range, in which case we just do nothing,
160  // with the exception being that we do not want to skip empty regions; in that case,
161  // we continue, so that - despite there being no data - all regions are added and
162  // processed by the iterator.
163  if(
164  base_iterator_type::current_ == base_iterator_type::end_ &&
165  parent_->skip_empty_regions_
166  ) {
167  parent_ = nullptr;
168  return;
169  }
170 
171  // Let's get going. This takes care of finding the first window that we want to emit.
172  find_next_chromosome_regions_();
173  fill_next_windows_();
174 
175  // We here need a special case for the first/last window markers,
176  // to cover the edge case that there is only a single window.
177  base_iterator_type::is_first_window_ = ( windows_.size() > 0 );
178  base_iterator_type::is_last_window_ = (
179  windows_.size() == 1 ||
180  ( windows_.size() > 1 && windows_[0].chromosome() != windows_[1].chromosome() )
181  );
182 
183  // We might have finished already, if there were no windows at all that we needed to fill.
184  // Could be that we did not have any regions, or skip empty regions, and there was no
185  // overlap with any data. Either way, in that case we have processed all our input
186  // already, and are done.
187  if( windows_.empty() ) {
188  assert( base_iterator_type::current_ == base_iterator_type::end_ );
189  parent_ = nullptr;
190  }
191  }
192 
193  public:
194 
195  virtual ~DerivedIterator() override = default;
196 
197  DerivedIterator( self_type const& ) = default;
198  DerivedIterator( self_type&& ) = default;
199 
200  DerivedIterator& operator= ( self_type const& ) = default;
201  DerivedIterator& operator= ( self_type&& ) = default;
202 
204 
205  // -------------------------------------------------------------------------
206  // Internal and Virtual Members
207  // -------------------------------------------------------------------------
208 
209  private:
210 
214  void increment_() override final
215  {
216  // Check that there is at least one window, which needs to be the case,
217  // as otherwise we would already have finished the iteration, or the user has called
218  // an increment on a past-the-end iterator, so we can asserted that as well here.
219  // For the same reason, we do not have marked this as the end yet, assert that as well.
220  assert( windows_.size() > 0 );
221  assert( parent_ );
222  assert( parent_->region_list_ );
223 
224  // Here we fill at least the next two windows (if possible), to know when we switch
225  // chromosomes, so that we can process our chromosome boundary first/last markers.
226  // In other words, whenever we use fill_next_windows_(), we end up with two or more
227  // windows in the queue while possible, and only end up with 1 window at the very end.
228  if( windows_.size() == 1 ) {
229  // We reached the end. Make sure that we are actually at the end,
230  // and did not forget to mark the window that we just left as the last.
231  // We also either have processed all regions in the list, or wanted to skip
232  // the empty ones anyway (which then are the ones for which there was no data).
233  // We don't really need to pop that last window, but seems cleaner that way.
234  assert( base_iterator_type::current_ == base_iterator_type::end_ );
235  assert( base_iterator_type::is_last_window_ );
236  assert(
237  seen_chromosomes_.size() == parent_->region_list_->chromosome_count() ||
238  parent_->skip_empty_regions_
239  );
240  windows_.pop_front();
241  parent_ = nullptr;
242 
243  } else {
244  assert( windows_.size() > 1 );
245 
246  // If we are about to enter a new chromosome, mark this.
247  // We are going to pop window 0 next, so then what is now window 1 becomes
248  // the current one, which in this case is the first of its chromosome.
249  base_iterator_type::is_first_window_ = (
250  windows_[0].chromosome() != windows_[1].chromosome()
251  );
252 
253  // We go to the next window, meaning we are done with this one,
254  // and fill enough to know where we are at - whether the next one is the last window
255  // of the chromosome or not.
256  windows_.pop_front();
257  fill_next_windows_();
258 
259  // Same as above, but for the end marker. This happens if either we are at the very
260  // end and there is only one window left, or if with the next increment of the
261  // iterator, we are going to have a new chromosome.
262  assert( windows_.size() > 0 );
263  base_iterator_type::is_last_window_ = (
264  windows_.size() == 1 ||
265  ( windows_[0].chromosome() != windows_[1].chromosome() )
266  );
267 
268  // We either have data in the window, or do not skip empty windows.
269  assert( ! windows_.front().empty() || ! parent_->skip_empty_regions_ );
270  }
271  }
272 
282  void fill_next_windows_()
283  {
284  // Shorthands and same basic assertions as before.
285  assert( parent_ );
286  assert( parent_->region_list_ );
287  auto const& region_list = *parent_->region_list_;
288  auto& cur_it = base_iterator_type::current_;
289  auto& end_it = base_iterator_type::end_;
290 
291  // We always need to have at least one window completely ready (filled with data),
292  // as this is the one that the iterator is currently at when it gets dereferenced.
293  // However, we also need to peek ahead to see if there is at least one more region
294  // down the line for which we have data, so that (depending on skip_empty_regions_) we
295  // can tell whether we are currently at the last window of the chromosome or not.
296 
297  // Fill all regions that overlap with our current pos (as we are only processing
298  // the input data once, so we have to do this at the same time). We stop once we have
299  // filled our current window _and_ determined whether it's the last of the chromosome.
300  // We might also reach the end of the data, or the next chromosome while looping.
301  while( cur_it != end_it ) {
302 
303  // Each loop iteration here processes one position of the input data stream.
304  // We need the chr and pos often, so let's store them here.
305  auto const cur_chr = parent_->chromosome_function( *cur_it );
306  auto const cur_pos = parent_->position_function( *cur_it );
307 
308  // We cannot assume to have any windows here... All might have been deleted
309  // in a previous iteration, when the position on the chromosome is after
310  // all regions in the list. So, no assertion about windows here.
311 
312  // The current window is done when we are either already at the next chromosome,
313  // or, if we are still on the same chromosome, we have processed all data in
314  // the interval of the window.
315  auto const cur_win_done = (
316  windows_.size() > 0 && (
317  cur_chr != windows_.front().chromosome() ||
318  cur_pos > windows_.front().last_position()
319  )
320  );
321 
322  // The next window is determined when we know whether it has data or won't be
323  // skipped anyway (if we dont skip_empty_regions_). It does not matter if this
324  // window is on the same chromosome as the current one - we just need to know
325  // that there is one more window coming that we want to yield to the user of this
326  // iterator, and from there we can determine the rest in increment_().
327  auto const next_win_determined = (
328  windows_.size() > 1 && (
329  ! windows_[1].empty() ||
330  ! parent_->skip_empty_regions_
331  )
332  );
333 
334  // Our actual exit condition: We have filled the current window, _and_ we know
335  // whether we are currently at the last window of the chromosome or not.
336  // It might happen that we never break out of here, and instead the loop exit
337  // condition kicks in, which is when we have reached the end of the data.
338  if( cur_win_done && next_win_determined ) {
339  break;
340  }
341 
342  // If we reach the end of the current chromosome, we need to find the next one
343  // that we want to use. There might be chromosomes in the data with no regions,
344  // and we might even reach the end of the input in that case.
345  // This means that we process windows in the same order of the data input stream,
346  // but independently of their order in the genome region list.
347  // The edge case that the windows are empty can occur when the first data entry
348  // on a chr is after the last region, so that all regions are deleted by
349  // fill_windows_at_current_position_() already in the previous loop iteration.
350  if( windows_.empty() || cur_chr != windows_.back().chromosome() ) {
351  // Let's find the next chromosome for which we have regions, and skip all data
352  // that does not have any regions in the list. Once we have found this, we add
353  // all its regions to our window list. This might keep quite a long list of
354  // empty windows in the queue, but that's okay, as this is waaaay easier to
355  // implement than trying to iterate through the regions at the same time as well.
356  find_next_chromosome_regions_();
357 
358  // After the above call, our position in the input stream has changed, so we
359  // need to restart the loop. As cur_it is a reference, it is also updated,
360  // so this is okay to check it here this way.
361  if( cur_it == end_it ) {
362  // If we reached the end of the data, this loop has done its purpose.
363  // (We could just continue here, as this is also the loop exit condition...)
364  break;
365  } else {
366  // If we are not yet at the end, we have moved our cur_it to a new chromosome.
367  // Assert that this was successfull. Then, start the loop again, to reset
368  // our shorthands, and check our exit conditions again for the cur it.
369  assert( ! windows_.empty() );
370  assert(
371  windows_.back().chromosome() == parent_->chromosome_function( *cur_it )
372  );
373  assert( cur_index_ == 0 );
374  continue;
375  }
376  }
377 
378  // Now add all data of the current position to all windows that span the locus.
379  // Let's keep the function a bit smaller, and outsource this step.
380  // We here assert that we are actually where we think we are, as the condition
381  // above might have changed our position on the input data.
382  assert( ! windows_.empty() );
383  assert( parent_->chromosome_function( *cur_it ) == cur_chr );
384  assert( parent_->position_function( *cur_it ) == cur_pos );
385  fill_windows_at_current_position_( cur_chr, cur_pos );
386 
387  // Edge case: The filling of the windows has determined that all windows (or all
388  // but one) were empty and to be deleted, so that none (or just one) are left.
389  // So now we are at a point in the input data where we know that no regions will
390  // come any more on that chromosome. That happens if at the end of chromosomes
391  // there is data, but no regions.
392  // Edge cases covered below:
393  // (1) Every window was removed by the above function, as none had data. This
394  // happens when in the beginning of the iteration, all regions on a chromosome
395  // get deleted, as none contain data.
396  // (2) Same case, but after we already have processed one with data and are now
397  // looking for another region on the same chr, but don't find data for it.
398  // (3) We already moved to the next chr in the data, but while trying to find
399  // a region with data, we also deleted all regions on there, because none
400  // had data. We need to cover this case here, as otherwise we'd be attempting
401  // to add the regions of the chr again by calling find_next_chromosome_regions_()
402  // which we do not want, so we skip the rest of the chr in the data.
403  // In all those cases, we kinda already know that the current window (if there is one)
404  // is the last on the chromosome, but our surrounding functions assume that we have
405  // two windows in the queue at all times (while possible), so we need to move to the
406  // next chromosome here, so that we can get it's regions added to the queue.
407  // Hence, we simply finish the data input for that chromosome.
408  if(
409  windows_.empty() ||
410  (
411  windows_.size() == 1 &&
412  cur_chr == windows_.front().chromosome() &&
413  cur_pos > windows_.front().last_position()
414  ) || (
415  windows_.size() == 1 &&
416  cur_chr != windows_.front().chromosome()
417  )
418  ) {
419  while( cur_it != end_it && parent_->chromosome_function( *cur_it ) == cur_chr ) {
420  ++cur_it;
421  }
422  continue;
423  }
424  assert( cur_it != end_it );
425  assert(
426  windows_.size() > 1 || (
427  windows_.size() == 1 &&
428  cur_chr == windows_.front().chromosome() &&
429  cur_pos <= windows_.front().last_position()
430  )
431  );
432 
433  // Move to next input.
434  ++cur_index_;
435  ++cur_it;
436 
437  // Make sure that input is at least sorted by position.
438  if(
439  cur_it != end_it &&
440  parent_->chromosome_function( *cur_it ) == cur_chr &&
441  parent_->position_function( *cur_it ) <= cur_pos
442  ) {
443  throw std::runtime_error(
444  "Input not sorted or containing duplicate positions,"
445  " on chromosome '" + cur_chr + "',"
446  " position " + std::to_string( parent_->position_function( *cur_it )) +
447  " found in the input after already having seen position " +
448  std::to_string( cur_pos )
449  );
450  }
451  }
452 
453  // Opposite edge case of above: We reached the end of the data stream before
454  // all positions for which we have regions were covered by the above loop.
455  // In that case, the deletion in fill_windows_at_current_position_() was never executed.
456  // That means, we might have regions for which there is no data. Might need to delete.
457  if( cur_it == end_it && parent_->skip_empty_regions_ ) {
458 
459  // Delete all empty windows from the back of the list for which we have not
460  // seen data, so that we did not discard them yet.
461  while( ! windows_.empty() && windows_.back().empty() ) {
462  windows_.pop_back();
463  }
464  }
465 
466  // Another edge case: We have reached the end of the data, but did not process all
467  // regions yet in the case that we don't want to skip the empty ones. In that case,
468  // we just all add them to the end of the list, so that they get processed.
469  // This needs to come here at the end, as we might get here after the above call to
470  // find_next_chromosome_regions_()
471  if(
472  cur_it == end_it &&
473  ! parent_->skip_empty_regions_ &&
474  seen_chromosomes_.size() != region_list.chromosome_count()
475  ) {
476  // It would be a bit more memory efficient to only add one of the remaning chrs
477  // here at a time. However, there might be chrs with exactly one region in the list,
478  // in which case our increment_() function would assume that this is the last
479  // window of the whole iteration... So instead of adding more bookkeeping here,
480  // we simply add all missing remaining regions now, and are done with it.
481  for( auto const& chr : region_list.chromosome_names() ) {
482  assert( region_list.region_count( chr ) > 0 );
483  if( seen_chromosomes_.count( chr ) == 0 ) {
484  add_chromosome_windows_( chr );
485  }
486  }
487  }
488  }
489 
497  void fill_windows_at_current_position_(
498  std::string const& cur_chr,
499  size_t cur_pos
500  ) {
501  auto& cur_it = base_iterator_type::current_;
502  auto& end_it = base_iterator_type::end_;
503  (void) end_it;
504 
505  assert( parent_ );
506  assert( cur_it != end_it );
507  assert( ! cur_chr.empty() );
508  assert( cur_pos > 0 );
509  assert( seen_chromosomes_.count( cur_chr ) > 0 );
510  assert( windows_.size() > 0 );
511 
512  // Add the current locus to all windows that span it. We only visit each locus
513  // once, so we need to make sure that all windows that span the locus get the data.
514  // We use a while loop here, as we might delete empty windows that did not receive
515  // any data while in the loop, and hence want some control over the iteration.
516  size_t i = 0;
517  while( i < windows_.size() ) {
518  auto& window = windows_[i];
519 
520  // The windows are sorted by start position, and so is the input data stream,
521  // so as soon as we reach one where we are not yet at,
522  // we can stop - none of the following windows in this loop need the current pos.
523  if( cur_chr == window.chromosome() && cur_pos < window.first_position() ) {
524  break;
525  }
526 
527  // If the window spans the current locus, add it.
528  if(
529  cur_chr == window.chromosome() &&
530  cur_pos >= window.first_position() &&
531  cur_pos <= window.last_position()
532  ) {
533 
534  // Make absolutely sure that we are adding the right stuff.
535  assert( cur_it != end_it );
536  assert( cur_chr == parent_->chromosome_function( *cur_it ));
537  assert( cur_pos == parent_->position_function( *cur_it ) );
538 
539  window.entries().emplace_back(
540  cur_index_,
541  cur_pos,
542  parent_->entry_input_function( *cur_it )
543  );
544  }
545 
546  // Finally, if we are past a window (either past is position, or on a different
547  // chromosome altogether), and that window is empty, we might want to delete it.
548  // We do not increment i in this case, as after earsing,
549  // the next window will have the current i.
550  if(
551  window.empty() &&
552  parent_->skip_empty_regions_ &&
553  ( cur_chr != window.chromosome() || cur_pos > window.last_position() )
554  ) {
555  windows_.erase( windows_.begin() + i );
556  continue;
557  }
558 
559  // If we are here, we filled the current position into the current window of
560  // the inner loop, or accept emmpty windows, and can continue with the next one.
561  assert( ! window.empty() || ! parent_->skip_empty_regions_ );
562  ++i;
563  }
564  }
565 
576  void find_next_chromosome_regions_()
577  {
578  // Checks and shorthands.
579  assert( parent_ );
580  assert( parent_->region_list_ );
581  auto const& region_list = *parent_->region_list_;
582  auto& cur_it = base_iterator_type::current_;
583  auto& end_it = base_iterator_type::end_;
584 
585  // The function is only called when we have data, or in the edge case that we have no
586  // data but don't want to skip empty regions, in which case there should be no windows.
587  assert( cur_it != end_it || windows_.size() == 0 );
588 
589  // Find the next chromosome for which the region list has regions.
590  // We use empty cur_chr to mark that we have not found a good chromosome yet.
591  std::string cur_chr = "";
592  while( cur_chr.empty() && cur_it != end_it ) {
593 
594  // Get the new chromosome that we are at now, and check that we have not seen it yet.
595  cur_chr = parent_->chromosome_function( *cur_it );
596  if( seen_chromosomes_.count( cur_chr ) > 0 ) {
597  throw std::runtime_error(
598  "Input not sorted, chromosome '" + cur_chr + "' has been in the input before."
599  );
600  }
601 
602  // If the chromosome does not have any regions in the list, we skip the whole
603  // input data for this chromosome, and move on to the next.
604  if(
605  ! region_list.has_chromosome( cur_chr ) ||
606  region_list.region_count( cur_chr ) == 0
607  ) {
608  while( cur_it != end_it && parent_->chromosome_function( *cur_it ) == cur_chr ) {
609  ++cur_it;
610  }
611  // Not adding the chr to the seen list here, as we want that list to only
612  // contain chrs from the region list, but not also the ones frmo the data.
613  // We might add later a check for data chrs as well, let's see.
614  // seen_chromosomes_.insert( cur_chr );
615  cur_chr = "";
616  }
617  }
618 
619  // We might have reached the end of the input data, with nothing left to do here.
620  // At this point, we do not know anything about the windows. We might have n windows
621  // from the prev chr for which we are trying to find data, but there is none, hence we
622  // ended up in this function here.
623  if( cur_it == end_it ) {
624  assert( cur_chr.empty() );
625  return;
626  }
627 
628  // Now we are at a chromosome for which there is data, and regions in the list.
629  // That does not yet mean that there is an overlap between the data and the regions,
630  // but that will be checked by fill_next_windows_().
631  assert( cur_it != end_it );
632  assert( cur_chr == parent_->chromosome_function( *cur_it ) );
633  assert( region_list.has_chromosome( cur_chr ));
634  assert( region_list.region_count( cur_chr ) > 0 );
635 
636  // Add those windows to the list. There needs to have been something added then.
637  add_chromosome_windows_( cur_chr );
638  assert( windows_.size() > 0 );
639 
640  // Reset the index for this chromosome.
641  cur_index_ = 0;
642  }
643 
648  void add_chromosome_windows_( std::string const& chromosome )
649  {
650  // Checks and shorthands.
651  assert( parent_ );
652  assert( parent_->region_list_ );
653  auto const& region_list = *parent_->region_list_;
654 
655  // We only call this function for chromosomes for which there are regions,
656  // and that we have not processed before. Both are ensured by the callers already.
657  assert( region_list.has_chromosome( chromosome ));
658  assert( region_list.region_count( chromosome ) > 0 );
659  assert( seen_chromosomes_.count( chromosome ) == 0 );
660 
661  // For simplicity, we add _all_ regions of the region list of this chromosome to our
662  // window list here already. We could be slightly more efficient, and only ever add
663  // them based on the position that we are at, but we add them empty, and fill in the
664  // data while we iterate the underlying data stream, so it's not much overhead,
665  // but makes the code here way simpler, as we do not have to iterate in parallel over
666  // the data _and_ the regions.
667  for( auto const& region : region_list.chromosome_regions( chromosome ) ) {
668  if( region.low() == 0 || region.high() == 0 ) {
669  throw std::runtime_error(
670  "Cannot process whole chromosomes with RegionWindowStream"
671  );
672  };
673 
674  // Check that they come in the right order, sorted by starting position,
675  // which should be guaranteed as IntervalTree is sorted that way.
676  assert(
677  windows_.empty() ||
678  windows_.back().chromosome() != chromosome ||
679  region.low() >= windows_.back().first_position()
680  );
681 
682  // Now add the window, with all its properties.
683  windows_.emplace_back();
684  auto& window = windows_.back();
685  window.chromosome( chromosome );
686  window.first_position( region.low() );
687  window.last_position( region.high() );
688  }
689 
690  // Mark that we have seen this chromosome and processed its windows.
691  seen_chromosomes_.insert( chromosome );
692  }
693 
694  value_type& get_current_window_() const override final
695  {
696  // If the window list is empty, we have reached the end of the iteration. Calling this
697  // function here is then a user error. Catch it a bit nicer with the assertion.
698  assert( ! windows_.empty() );
699  return const_cast<value_type&>( windows_.front() );
700  }
701 
702  BaseWindowStream<InputStreamIterator, DataType> const* get_parent_() const override final
703  {
704  return parent_;
705  }
706 
707  private:
708 
709  // Parent. Needs to live here to have the correct derived type.
710  RegionWindowStream const* parent_ = nullptr;
711 
712  // We store the chromosome names from the region list that we already processed,
713  // as a double check so that we don't start multiple times, but allow chromosomes
714  // in any order, and only expect positions within a chromosome to be sorted.
715  std::unordered_set<std::string> seen_chromosomes_;
716 
717  // Store the windows that we are filling. This stores all windows of a chromosome,
718  // and gets cleared and filled with new windows when the underlying iterator reaches
719  // the next chromosome.
720  std::deque<Window> windows_;
721  size_t cur_index_ = 0;
722  };
723 
724  // ======================================================================================
725  // Main Class
726  // ======================================================================================
727 
728  // -------------------------------------------------------------------------
729  // Constructors and Rule of Five
730  // -------------------------------------------------------------------------
731 
733  InputStreamIterator begin, InputStreamIterator end,
734  std::shared_ptr<GenomeRegionList> region_list
735  )
736  : base_type( begin, end )
737  , region_list_( region_list )
738  {
739  if( ! region_list ) {
740  throw std::invalid_argument(
741  "No GenomeRegionList provided for creating RegionWindowStream"
742  );
743  }
744  }
745 
746  virtual ~RegionWindowStream() override = default;
747 
748  RegionWindowStream( RegionWindowStream const& ) = default;
749  RegionWindowStream( RegionWindowStream&& ) = default;
750 
751  RegionWindowStream& operator= ( RegionWindowStream const& ) = default;
753 
755 
756  // -------------------------------------------------------------------------
757  // Settings
758  // -------------------------------------------------------------------------
759 
761  {
762  skip_empty_regions_ = value;
763  return *this;
764  }
765 
766  bool skip_empty_regions() const
767  {
768  return skip_empty_regions_;
769  }
770 
771  // -------------------------------------------------------------------------
772  // Virtual Members
773  // -------------------------------------------------------------------------
774 
775 protected:
776 
777  std::unique_ptr<typename BaseWindowStream<InputStreamIterator, DataType>::BaseIterator>
778  get_begin_iterator_() override final
779  {
780  // Cannot use make_unique here, as the Iterator constructor is private,
781  // and trying to make make_unique a friend does not seem to be working...
782  return std::unique_ptr<DerivedIterator>( new DerivedIterator( this ));
783  // return utils::make_unique<DerivedIterator>( this );
784  }
785 
786  std::unique_ptr<typename BaseWindowStream<InputStreamIterator, DataType>::BaseIterator>
787  get_end_iterator_() override final
788  {
789  return std::unique_ptr<DerivedIterator>( new DerivedIterator() );
790  // return utils::make_unique<DerivedIterator>( nullptr );
791  }
792 
793  // -------------------------------------------------------------------------
794  // Data Members
795  // -------------------------------------------------------------------------
796 
797 private:
798 
799  // List of regions that we want to iterate over; each region will be represented as a Window.
800  std::shared_ptr<GenomeRegionList> region_list_;
801 
802  // Settings
803  bool skip_empty_regions_ = true;
804 
805 };
806 
807 // =================================================================================================
808 // Make Region Window Stream
809 // =================================================================================================
810 
820 template<class InputStreamIterator, class DataType = typename InputStreamIterator::value_type>
821 RegionWindowStream<InputStreamIterator, DataType>
823  InputStreamIterator begin, InputStreamIterator end,
824  std::shared_ptr<GenomeRegionList> region_list
825 ) {
826  auto it = RegionWindowStream<InputStreamIterator, DataType>( begin, end, region_list );
827  return it;
828 }
829 
840 template<class InputStreamIterator>
841 RegionWindowStream<InputStreamIterator>
843  InputStreamIterator begin, InputStreamIterator end,
844  std::shared_ptr<GenomeRegionList> region_list
845 ) {
846  using DataType = typename InputStreamIterator::value_type;
847 
848  // Set functors.
849  auto it = RegionWindowStream<InputStreamIterator>( begin, end, region_list );
850  it.entry_input_function = []( DataType const& variant ) {
851  return variant;
852  };
853  it.chromosome_function = []( DataType const& variant ) {
854  return variant.chromosome;
855  };
856  it.position_function = []( DataType const& variant ) {
857  return variant.position;
858  };
859 
860  // Set properties.
861  return it;
862 }
863 
875 template<class InputStreamIterator>
876 WindowViewStream<InputStreamIterator>
878  InputStreamIterator begin, InputStreamIterator end,
879  std::shared_ptr<GenomeRegionList> region_list
880 ) {
882  make_default_region_window_stream( begin, end, region_list )
883  );
884 }
885 
886 } // namespace population
887 } // namespace genesis
888 
889 #endif // include guard
genesis::population::RegionWindowStream::DerivedIterator::operator=
DerivedIterator & operator=(self_type const &)=default
genesis::population::BaseWindowStream
Base class for streams of Windows over the chromosomes of a genome.
Definition: base_window_stream.hpp:119
genesis::population::BaseWindowStream::BaseIterator::iterator_category
std::input_iterator_tag iterator_category
Definition: base_window_stream.hpp:491
genesis::population::RegionWindowStream::operator=
RegionWindowStream & operator=(RegionWindowStream const &)=default
base_window_stream.hpp
genesis::population::BaseWindowStream::chromosome_function
std::function< std::string(InputType const &)> chromosome_function
Functor that yields the current chromosome, given the input stream data.
Definition: base_window_stream.hpp:152
genesis::population::RegionWindowStream::RegionWindowStream
RegionWindowStream(InputStreamIterator begin, InputStreamIterator end, std::shared_ptr< GenomeRegionList > region_list)
Definition: region_window_stream.hpp:732
genesis::population::make_default_region_window_view_stream
WindowViewStream< InputStreamIterator > make_default_region_window_view_stream(InputStreamIterator begin, InputStreamIterator end, std::shared_ptr< GenomeRegionList > region_list)
Helper class that creates a RegionWindowStream and wraps it in a WindowViewStream.
Definition: region_window_stream.hpp:877
genome_locus.hpp
genesis::population::BaseWindowStream< InputStreamIterator, typename InputStreamIterator::value_type >::end
Iterator end()
Definition: base_window_stream.hpp:759
genesis::population::Window< DataType >
genesis::population::RegionWindowStream::const_reference
value_type const & const_reference
Definition: region_window_stream.hpp:108
genesis::population::RegionWindowStream::DerivedIterator
Internal iterator that produces Windows.
Definition: region_window_stream.hpp:117
genesis::population::make_region_window_stream
RegionWindowStream< InputStreamIterator, DataType > make_region_window_stream(InputStreamIterator begin, InputStreamIterator end, std::shared_ptr< GenomeRegionList > region_list)
Helper function to instantiate a RegionWindowStream without the need to specify the template paramete...
Definition: region_window_stream.hpp:822
genesis::population::RegionWindowStream::DerivedIterator::value_type
Window value_type
Definition: region_window_stream.hpp:138
window_view.hpp
genesis::population::make_default_region_window_stream
RegionWindowStream< InputStreamIterator > make_default_region_window_stream(InputStreamIterator begin, InputStreamIterator end, std::shared_ptr< GenomeRegionList > region_list)
Helper function to instantiate a RegionWindowStream for a default use case.
Definition: region_window_stream.hpp:842
genesis::population::RegionWindowStream::iterator_category
std::input_iterator_tag iterator_category
Definition: region_window_stream.hpp:104
genesis::population::BaseWindowStream::BaseIterator::value_type
WindowType value_type
Definition: base_window_stream.hpp:492
genesis::population::BaseWindowStream::BaseIterator::const_reference
value_type const & const_reference
Definition: base_window_stream.hpp:495
genesis::population::BaseWindowStream< InputStreamIterator, typename InputStreamIterator::value_type >::DataType
typename InputStreamIterator::value_type DataType
Definition: base_window_stream.hpp:128
genesis::population::RegionWindowStream::DerivedIterator
friend DerivedIterator
Definition: region_window_stream.hpp:754
genesis::population::RegionWindowStream::DerivedIterator::~DerivedIterator
virtual ~DerivedIterator() override=default
genesis::population::RegionWindowStream::InputType
typename InputStreamIterator::value_type InputType
Definition: region_window_stream.hpp:102
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::population::RegionWindowStream::DerivedIterator::Entry
typename Window::Entry Entry
Definition: region_window_stream.hpp:134
genome_region.hpp
genesis::population::RegionWindowStream::get_end_iterator_
std::unique_ptr< typename BaseWindowStream< InputStreamIterator, DataType >::BaseIterator > get_end_iterator_() override final
Get the end iterator.
Definition: region_window_stream.hpp:787
range.hpp
genesis::population::BaseWindowStream::position_function
std::function< size_t(InputType const &)> position_function
Functor that yields the current position on the chromosome, given the input stream data.
Definition: base_window_stream.hpp:158
genome_region_list.hpp
genesis::population::RegionWindowStream
Stream for Windows representing regions of a genome.
Definition: region_window_stream.hpp:89
genesis::population::BaseWindowStream::BaseIterator::BaseIterator
BaseIterator()=default
genesis::population::BaseWindowStream::BaseIterator::self_type
typename BaseWindowStream< InputStreamType, DataType, WindowType >::BaseIterator self_type
Definition: base_window_stream.hpp:488
window_view_stream.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::Window::Entry
Data that is stored per entry that was enqueued in a window.
Definition: window.hpp:125
genesis::population::BaseWindowStream< InputStreamIterator, typename InputStreamIterator::value_type >::begin
Iterator begin()
Definition: base_window_stream.hpp:747
genesis::population::RegionWindowStream::DerivedIterator::Window
::genesis::population::Window< DataType > Window
Definition: region_window_stream.hpp:133
window.hpp
genesis::population::BaseWindowStream::entry_input_function
std::function< DataType(InputType const &)> entry_input_function
Functor to convert from the underlying input stream that provides the data to fill the windows to the...
Definition: base_window_stream.hpp:147
genesis::population::make_window_view_stream
WindowViewStream< typename T::InputStreamType, typename T::DataType > make_window_view_stream(T const &window_iterator)
Create a WindowViewStream that iterates some underlying BaseWindowStream.
Definition: window_view_stream.hpp:317
genesis::population::RegionWindowStream::skip_empty_regions
bool skip_empty_regions() const
Definition: region_window_stream.hpp:766
genesis::population::BaseWindowStream::BaseIterator::pointer
value_type * pointer
Definition: base_window_stream.hpp:493
genesis::population::RegionWindowStream::Window
::genesis::population::Window< DataType > Window
Definition: region_window_stream.hpp:100
genesis::population::RegionWindowStream::skip_empty_regions
self_type & skip_empty_regions(bool value)
Definition: region_window_stream.hpp:760
genesis::population::RegionWindowStream::get_begin_iterator_
std::unique_ptr< typename BaseWindowStream< InputStreamIterator, DataType >::BaseIterator > get_begin_iterator_() override final
Get the begin iterator.
Definition: region_window_stream.hpp:778
genesis::population::RegionWindowStream::DerivedIterator::RegionWindowStream
friend RegionWindowStream
Definition: region_window_stream.hpp:203
genesis::population::BaseWindowStream::BaseIterator::reference
value_type & reference
Definition: base_window_stream.hpp:494
genesis::population::RegionWindowStream::DerivedIterator::base_iterator_type
typename BaseWindowStream< InputStreamIterator, DataType >::BaseIterator base_iterator_type
Definition: region_window_stream.hpp:131
genesis::population::RegionWindowStream::~RegionWindowStream
virtual ~RegionWindowStream() override=default
genesis::population::BaseWindowStream::BaseIterator::InputType
typename InputStreamType::value_type InputType
Definition: base_window_stream.hpp:489
genesis::population::RegionWindowStream::Entry
typename Window::Entry Entry
Definition: region_window_stream.hpp:101