A library for working with phylogenetic and population genetic data.
v0.32.0
sliding_window_generator.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_WINDOW_SLIDING_WINDOW_GENERATOR_H_
2 #define GENESIS_POPULATION_WINDOW_SLIDING_WINDOW_GENERATOR_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2022 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
34 #include <cassert>
35 #include <deque>
36 #include <functional>
37 #include <stdexcept>
38 #include <string>
39 #include <vector>
40 
43 
44 namespace genesis {
45 namespace population {
46 
47 // =================================================================================================
48 // Sliding Window Type
49 // =================================================================================================
50 
56 {
63  kInterval,
64 
69  kVariants,
70 
79 };
80 
81 // =================================================================================================
82 // Genomic Sliding Window Generator
83 // =================================================================================================
84 
172 template<class D, class A = EmptyAccumulator>
174 {
175 public:
176 
177  // -------------------------------------------------------------------------
178  // Typedefs and Enums
179  // -------------------------------------------------------------------------
180 
181  using Data = D;
182  using Accumulator = A;
184  using Entry = typename Window::Entry;
185 
187 
201  using on_chromosome_start = std::function<void(
202  std::string const& chromosome,
203  typename Window::Accumulator& accumulator
204  )>;
205 
215  using on_chromosome_finish = std::function<void(
216  std::string const& chromosome,
217  typename Window::Accumulator& accumulator
218  )>;
219 
235  using on_enqueue = std::function<void(
236  typename Window::Entry const& entry,
237  typename Window::Accumulator& accumulator
238  )>;
239 
249  using on_dequeue = std::function<void(
250  typename Window::Entry const& entry,
251  typename Window::Accumulator& accumulator
252  )>;
253 
265  using on_emission = std::function<void(
266  Window const& window
267  )>;
268 
269  // -------------------------------------------------------------------------
270  // Constructors and Rule of Five
271  // -------------------------------------------------------------------------
272 
273  // /**
274  // * @brief Create a default (empty) instance.
275  // *
276  // * Not really important for usage, as this cannot do much. Typically, one nees a window width
277  // * and stride for doing any work. But having a default constructor comes in handy at times.
278  // */
279  // SlidingWindowGenerator() = default;
280 
290  : window_type_(type)
291  , width_(width)
292  , stride_(stride)
293  {
294  if( width == 0 ) {
295  throw std::runtime_error( "Cannot use SlidingWindowGenerator of width 0." );
296  }
297  if( stride == 0 ) {
298  stride_ = width;
299  }
300  if( stride_ > width_ ) {
301  throw std::runtime_error( "Cannot use SlidingWindowGenerator with stride > width." );
302  }
303  }
304 
313  {
315  }
316 
319 
322 
323  // -------------------------------------------------------------------------
324  // Settings
325  // -------------------------------------------------------------------------
326 
331  {
332  return window_type_;
333  }
334 
342  size_t width() const
343  {
344  return width_;
345  }
346 
355  size_t stride() const
356  {
357  return stride_;
358  }
359 
366  {
367  return emit_incomplete_windows_;
368  }
369 
377  void emit_incomplete_windows( bool value )
378  {
379  emit_incomplete_windows_ = value;
380  }
381 
382  // /* *
383  // * @brief Get the WindowAnchorType that we use for the emitted Window%s.
384  // */
385  // WindowAnchorType anchor_type() const
386  // {
387  // return window_.anchor_type();
388  // }
389  //
390  // /* *
391  // * @brief Set the WindowAnchorType that we use for the emitted Window%s.
392  // */
393  // void anchor_type( WindowAnchorType value )
394  // {
395  // window_.anchor_type( value );
396  // }
397 
398  // -------------------------------------------------------------------------
399  // Accessors & Modifiers
400  // -------------------------------------------------------------------------
401 
408  std::string const& chromosome() const
409  {
410  // We could keep our own chromosome here, but Window already as a member for this,
411  // so we just re-use.
412  return window_.chromosome();
413  }
414 
421  bool empty() const
422  {
423  return next_index_ == 0;
424  }
425 
432  void clear()
433  {
434  current_start_ = 0;
435  next_index_ = 0;
436  window_.clear();
437  }
438 
439  // -------------------------------------------------------------------------
440  // Event Plugin Functionals
441  // -------------------------------------------------------------------------
442 
447  {
448  chromosome_start_plugins_.push_back(plugin);
449  return *this;
450  }
451 
456  {
457  chromosome_finish_plugins_.push_back(plugin);
458  return *this;
459  }
460 
465  {
466  enqueue_plugins_.push_back(plugin);
467  return *this;
468  }
469 
474  {
475  dequeue_plugins_.push_back(plugin);
476  return *this;
477  }
478 
483  {
484  emission_plugins_.push_back(plugin);
485  return *this;
486  }
487 
494  {
495  chromosome_start_plugins_.clear();
496  chromosome_finish_plugins_.clear();
497  enqueue_plugins_.clear();
498  dequeue_plugins_.clear();
499  emission_plugins_.clear();
500  }
501 
502  // -------------------------------------------------------------------------
503  // Enqueue and Generate Windows
504  // -------------------------------------------------------------------------
505 
512  void start_chromosome( std::string const& chromosome )
513  {
514  if( chromosome != window_.chromosome() ) {
516  window_.chromosome( chromosome );
517  }
518  }
519 
534  void enqueue( std::string const& chromosome, size_t position, Data const& data )
535  {
537  enqueue_( position, Data{ data });
538  }
539 
545  void enqueue( std::string const& chromosome, size_t position, Data&& data )
546  {
548  enqueue_( position, std::move( data ));
549  }
550 
562  void enqueue( size_t position, Data const& data )
563  {
564  enqueue_( position, Data{ data });
565  }
566 
572  void enqueue( size_t position, Data&& data )
573  {
574  enqueue_( position, std::move( data ));
575  }
576 
611  void finish_chromosome( size_t last_position = 0 )
612  {
613  // If nothing was enqueued yet, there is nothing to finish.
614  // This also makes sure that calling this function multiple times in a row does not
615  // have any side effects.
616  if( next_index_ == 0 ) {
617  return;
618  }
619 
620  // If we did not get a specific last position, we just finish the current interval.
621  if( last_position == 0 ) {
622  last_position = current_start_ + width_;
623  }
624 
625  // Boundary check. We make sure that the given position is neither in front of the current
626  // window, or, if there are entries in the list, also not in front of those.
627  // See enqueue_() for details. Same here.
628  size_t cur_end = window_.entries().empty() ? 0 : window_.entries().back().position;
629  cur_end = current_start_ > cur_end ? current_start_ - 1 : cur_end;
630  if( last_position <= cur_end ) {
631  throw std::runtime_error(
632  "Cannot call finish_chromosome() with position " + std::to_string(last_position) +
633  ", as the current window/chromosome is already advanced up to position " +
634  std::to_string( cur_end ) + "."
635  );
636  }
637 
638  // Emit the remaining data entries.
639  if( window_type_ == SlidingWindowType::kInterval ) {
640  synchronize_interval_( last_position );
641  assert( current_start_ <= last_position );
642  assert( last_position < current_start_ + width_ );
643 
644  // Special case for the emit_incomplete_windows_ setting. We have synchronized so that
645  // the given last_position is within the current interval. Now we need to emit that
646  // particular (incomplete) window and clean it up. The last p
647  if( emit_incomplete_windows_ ) {
648  emit_window_( current_start_, last_position + 1, last_position + 1 );
649  }
650 
651  } else if( window_type_ == SlidingWindowType::kVariants ) {
652  throw std::runtime_error( "Not yet implemented." );
653  } else {
654  assert( false );
655  }
656 
657  // Wrap up the chromosome, and clear, so that we can start a new chromosome cleanly.
658  for( auto const& chromosome_finish_plugin : chromosome_finish_plugins_ ) {
659  if( chromosome_finish_plugin ) {
660  chromosome_finish_plugin( window_.chromosome(), window_.accumulator() );
661  }
662  }
663  clear();
664  }
665 
666  // -------------------------------------------------------------------------
667  // General Internal Members
668  // -------------------------------------------------------------------------
669 
670 private:
671 
672  void enqueue_( size_t position, Data&& data )
673  {
674  // If this is the first enqueuing of the window or the chromosome,
675  // we need to call the start plugin.
676  if( next_index_ == 0 ) {
677  for( auto const& chromosome_start_plugin : chromosome_start_plugins_ ) {
678  if( chromosome_start_plugin ) {
679  chromosome_start_plugin( window_.chromosome(), window_.accumulator() );
680  }
681  }
682  }
683 
684  // Boundary check. We make sure that the given position is neither in front of the current
685  // window, or, if there are entries in the list, also not in front of those.
686  // (There might be cases were we are already in the middle of the chromosome, but the
687  // entries list is empty. Not entirely sure when this can occurr, but it feels like it can,
688  // and just checking this doesn't cost us much. If anyone wants to think this through,
689  // feel free.)
690  size_t cur_end = window_.entries().empty() ? 0 : window_.entries().back().position;
691  cur_end = current_start_ > cur_end ? current_start_ - 1 : cur_end;
692  if( position <= cur_end ) {
693  throw std::invalid_argument(
694  "Cannot enqueue at position " + std::to_string( position ) +
695  ", as the current window/chromosome is already advanced up to position " +
696  std::to_string( cur_end ) +
697  ". Either start a new window or a new chromosome within the window. "
698  "Note that this error might be caused by a VCF file that is not sorted by "
699  "chromosome and position."
700  );
701  }
702  assert( position >= current_start_ );
703  assert( window_.entries().empty() || position > window_.entries().back().position );
704 
705  // Do the correct type of enqueuing.
706  if( window_type_ == SlidingWindowType::kInterval ) {
707  enqueue_interval_( position, std::move( data ));
708  } else if( window_type_ == SlidingWindowType::kVariants ) {
709  throw std::runtime_error( "Not yet implemented." );
710  } else {
711  assert( false );
712  }
713  }
714 
715  // -------------------------------------------------------------------------
716  // Interval Internal Members
717  // -------------------------------------------------------------------------
718 
719 private:
720 
724  void enqueue_interval_( size_t position, Data&& data )
725  {
726  assert( window_type_ == SlidingWindowType::kInterval );
727 
728  // Make sure that we move to the interval where our position needs to be added to.
729  synchronize_interval_( position );
730  assert( current_start_ <= position );
731  assert( position < current_start_ + width_ );
732 
733  // Add the new data to our entry queue.
734  window_.entries().emplace_back( next_index_, position, std::move( data ));
735  ++next_index_;
736 
737  // Run the enqueue event plugins. We do not emit anything here. That will be done once
738  // the interval is finished, that is, above, when a new position outside of the interval
739  // is added (or we finish the whole iteration).
740  for( auto const& enqueue_plugin : enqueue_plugins_ ) {
741  if( enqueue_plugin ) {
742  assert( window_.entries().size() > 0 );
743  enqueue_plugin( window_.entries().back(), window_.accumulator() );
744  }
745  }
746 
747  // Make sure that all entries in the queue are within our current bounds,
748  // and are in the correct order. We use a lambda that we immediately call.
749  assert( [&](){
750  size_t cur_pos = 0;
751  for( auto it : window_.entries() ) {
752  if( it.position < current_start_ || it.position >= current_start_ + width_ ) {
753  return false;
754  }
755  if( it.position < cur_pos ) {
756  return false;
757  }
758  cur_pos = it.position;
759  }
760  return true;
761  }() );
762  }
763 
767  void synchronize_interval_( size_t position )
768  {
769  assert( window_type_ == SlidingWindowType::kInterval );
770 
771  // This function is only called internally, and only if we are sure that the position is
772  // valid. Let's assert this.
773  assert( position >= current_start_ );
774  assert( window_.entries().empty() || window_.entries().back().position < position );
775 
776  // Either there are no entries, or they are all within the current interval.
777  // That has to be the case, because we emit if we finish an interval, and remove the data.
778  // So, there should never be data that is from an old interval at this point here.
779  assert(
780  window_.entries().empty() || window_.entries().front().position >= current_start_
781  );
782  assert(
783  window_.entries().empty() || window_.entries().front().position < current_start_ + width_
784  );
785  assert(
786  window_.entries().empty() || window_.entries().back().position >= current_start_
787  );
788  assert(
789  window_.entries().empty() || window_.entries().back().position < current_start_ + width_
790  );
791 
792  // Emit the windows up to the position where we want to enqueue the new data entry.
793  // As we slide over intervals of fixed size along the genome, this can mean that we
794  // have to emit multiple (potentially empty) windows along the way, until we are at the
795  // interval that contains our new position.
796  while( current_start_ + width_ <= position ) {
797 
798  // Emit and move to next interval.
799  emit_window_( current_start_, current_start_ + width_, current_start_ + stride_ );
800  current_start_ += stride_;
801  }
802 
803  // We are now within the exact interval where we need to be.
804  assert( current_start_ <= position );
805  assert( position < current_start_ + width_ );
806  }
807 
811  void emit_window_( size_t first_position, size_t last_position, size_t dequeue_until )
812  {
813  assert( window_type_ == SlidingWindowType::kInterval );
814 
815  // Make sure that all entries in the queue are within our current bounds,
816  // and are in the correct order. We use a lambda that we immediately call.
817  assert( [&](){
818  size_t cur_pos = 0;
819  for( auto const& it : window_.entries() ) {
820  if( it.position < first_position || it.position >= last_position ) {
821  return false;
822  }
823  if( it.position < cur_pos ) {
824  return false;
825  }
826  cur_pos = it.position;
827  }
828  return true;
829  }() );
830 
831  // Prepare the window properties.
832  assert( last_position > first_position );
833  window_.first_position( first_position );
834  window_.last_position( last_position );
835 
836  // Now emit all plugin functions.
837  for( auto const& emission_plugin : emission_plugins_ ) {
838  if( emission_plugin ) {
839  emission_plugin( window_ );
840  }
841  }
842 
843  // Dequeue everything that just moved out of the current interval.
844  while(
845  window_.entries().size() > 0 && window_.entries().front().position < dequeue_until
846  ) {
847  for( auto const& dequeue_plugin : dequeue_plugins_ ) {
848  if( dequeue_plugin ) {
849  dequeue_plugin( window_.entries().front(), window_.accumulator() );
850  }
851  }
852  window_.entries().pop_front();
853  }
854  }
855 
856  // -------------------------------------------------------------------------
857  // Data Members
858  // -------------------------------------------------------------------------
859 
860 private:
861 
862  // Fixed settings
863  SlidingWindowType window_type_;
864  size_t width_;
865  size_t stride_;
866 
867  // Other settings
868  bool emit_incomplete_windows_ = false;
869 
870  // Current window and its position
871  size_t current_start_ = 0;
872  size_t next_index_ = 0;
873  Window window_;
874 
875  // Plugin functions
876  std::vector<on_chromosome_start> chromosome_start_plugins_;
877  std::vector<on_chromosome_finish> chromosome_finish_plugins_;
878  std::vector<on_enqueue> enqueue_plugins_;
879  std::vector<on_dequeue> dequeue_plugins_;
880  std::vector<on_emission> emission_plugins_;
881 
882 };
883 
884 } // namespace population
885 } // namespace genesis
886 
887 #endif // include guard
genesis::population::Window::accumulator
Accumulator & accumulator()
Get the Accumulator data that can be used for speeding up certain window computations.
Definition: window.hpp:378
genesis::population::SlidingWindowGenerator::emit_incomplete_windows
bool emit_incomplete_windows() const
Get whether the last (incomplete) window is also emitted.
Definition: sliding_window_generator.hpp:365
genesis::population::SlidingWindowGenerator::width
size_t width() const
Get the non-mutable width of this SlidingWindowGenerator.
Definition: sliding_window_generator.hpp:342
genesis::population::SlidingWindowGenerator::operator=
SlidingWindowGenerator & operator=(SlidingWindowGenerator const &)=default
genesis::population::SlidingWindowGenerator< double >::on_chromosome_start
std::function< void(std::string const &chromosome, typename Window::Accumulator &accumulator)> on_chromosome_start
Plugin functions that are called on the first enqueue() of a newly started chromosome.
Definition: sliding_window_generator.hpp:204
genesis::population::SlidingWindowGenerator::add_emission_plugin
self_type & add_emission_plugin(on_emission const &plugin)
Add an on_emission plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:482
genesis::population::SlidingWindowGenerator::clear_plugins
void clear_plugins()
Clear all plugin functions.
Definition: sliding_window_generator.hpp:493
genesis::population::SlidingWindowGenerator< double >::on_emission
std::function< void(Window const &window)> on_emission
Main plugin functions that are called for every window.
Definition: sliding_window_generator.hpp:267
genesis::population::SlidingWindowGenerator::add_enqueue_plugin
self_type & add_enqueue_plugin(on_enqueue const &plugin)
Add an on_enqueue plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:464
genesis::population::Window< double, EmptyAccumulator >
genesis::population::SlidingWindowGenerator< double >::on_dequeue
std::function< void(typename Window::Entry const &entry, typename Window::Accumulator &accumulator)> on_dequeue
Plugin functions to update the Accumulator when Data is removed due to the window moving away from it...
Definition: sliding_window_generator.hpp:252
genesis::population::SlidingWindowGenerator::chromosome
std::string const & chromosome() const
Get the chromosome name that we are currently processing.
Definition: sliding_window_generator.hpp:408
genesis::population::SlidingWindowType::kChromosome
@ kChromosome
Windows of this type contain positions across a whole chromosome.
genesis::population::SlidingWindowGenerator::finish_chromosome
void finish_chromosome(size_t last_position=0)
Explicitly finish a chromosome, and emit all remaining Windows.
Definition: sliding_window_generator.hpp:611
genesis::population::SlidingWindowType::kVariants
@ kVariants
Windows of this type are defined as containing a fixed number of entries (usually,...
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
functions.hpp
genesis::population::SlidingWindowGenerator::clear
void clear()
Clear all data of the Window.
Definition: sliding_window_generator.hpp:432
genesis::population::SlidingWindowGenerator< double >::on_enqueue
std::function< void(typename Window::Entry const &entry, typename Window::Accumulator &accumulator)> on_enqueue
Plugin functions to update the Accumulator when new Data is enqueued.
Definition: sliding_window_generator.hpp:238
genesis::population::SlidingWindowGenerator::emit_incomplete_windows
void emit_incomplete_windows(bool value)
Set whether the last (incomplete) window is also emitted.
Definition: sliding_window_generator.hpp:377
genesis::population::BaseWindow::chromosome
std::string const & chromosome() const
Get the chromosome name that this Window belongs to.
Definition: base_window.hpp:90
genesis::population::Window::entries
container const & entries() const
Immediate container access to the Data Entries.
Definition: window.hpp:362
genesis::population::SlidingWindowType
SlidingWindowType
SlidingWindowType of a Window, that is, whether we slide along a fixed size interval of the genome,...
Definition: sliding_window_generator.hpp:55
genesis::population::EmptyAccumulator
Empty helper data struct to serve as a dummy for Window.
Definition: window.hpp:57
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(size_t position, Data const &data)
Enqueue a new Data value, without considering its chromosome.
Definition: sliding_window_generator.hpp:562
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::Window::Entry
Data that is stored per entry that was enqueued in a window.
Definition: window.hpp:125
genesis::population::SlidingWindowGenerator::add_chromosome_start_plugin
self_type & add_chromosome_start_plugin(on_chromosome_start const &plugin)
Add an on_chromosome_start plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:446
genesis::population::SlidingWindowGenerator::empty
bool empty() const
Return whether the instance is empty.
Definition: sliding_window_generator.hpp:421
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(std::string const &chromosome, size_t position, Data &&data)
Enqueue a new Data value, by moving it.
Definition: sliding_window_generator.hpp:545
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(size_t position, Data &&data)
Enqueue a new Data value by moving it, without considering its chromosome.
Definition: sliding_window_generator.hpp:572
genesis::population::BaseWindow::clear
void clear()
Clear all data from the Window.
Definition: base_window.hpp:234
genesis::population::SlidingWindowGenerator::~SlidingWindowGenerator
~SlidingWindowGenerator()
Destruct the instance.
Definition: sliding_window_generator.hpp:312
genesis::population::SlidingWindowGenerator::SlidingWindowGenerator
SlidingWindowGenerator(SlidingWindowType type, size_t width, size_t stride=0)
Construct a SlidingWindowGenerator, given the SlidingWindowType and width, and potentially stride.
Definition: sliding_window_generator.hpp:289
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(std::string const &chromosome, size_t position, Data const &data)
Enqueue a new Data value.
Definition: sliding_window_generator.hpp:534
window.hpp
genesis::population::SlidingWindowGenerator< double >::Data
double Data
Definition: sliding_window_generator.hpp:181
genesis::population::SlidingWindowType::kInterval
@ kInterval
Windows of this type are defined by a fixed start and end position on a chromosome.
genesis::population::SlidingWindowGenerator::window_type
SlidingWindowType window_type() const
Get the non-mutable SlidingWindowType of this SlidingWindowGenerator.
Definition: sliding_window_generator.hpp:330
genesis::population::SlidingWindowGenerator::add_dequeue_plugin
self_type & add_dequeue_plugin(on_dequeue const &plugin)
Add an on_dequeue plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:473
genesis::population::SlidingWindowGenerator::add_chromosome_finish_plugin
self_type & add_chromosome_finish_plugin(on_chromosome_finish const &plugin)
Add an on_chromosome_finish plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:455
genesis::population::SlidingWindowGenerator::start_chromosome
void start_chromosome(std::string const &chromosome)
Signal the start of a new chromosome, given its name.
Definition: sliding_window_generator.hpp:512
genesis::population::SlidingWindowGenerator::stride
size_t stride() const
Get the non-mutable stride of this SlidingWindowGenerator.
Definition: sliding_window_generator.hpp:355
genesis::population::SlidingWindowGenerator
Generator for sliding Windows over the chromosomes of a genome.
Definition: sliding_window_generator.hpp:173
genesis::population::SlidingWindowGenerator< double >::Entry
typename Window::Entry Entry
Definition: sliding_window_generator.hpp:184
genesis::population::SlidingWindowGenerator< double >::on_chromosome_finish
std::function< void(std::string const &chromosome, typename Window::Accumulator &accumulator)> on_chromosome_finish
Plugin functions that are called when finishing a chromosome.
Definition: sliding_window_generator.hpp:218