A library for working with phylogenetic data.
v0.25.0
sliding_window_generator.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_WINDOW_SLIDING_WINDOW_GENERATOR_H_
2 #define GENESIS_POPULATION_WINDOW_SLIDING_WINDOW_GENERATOR_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2020 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
34 #include <cassert>
35 #include <deque>
36 #include <functional>
37 #include <stdexcept>
38 #include <string>
39 #include <vector>
40 
42 
43 namespace genesis {
44 namespace population {
45 
46 // =================================================================================================
47 // Genomic Sliding Window Generator
48 // =================================================================================================
49 
123 template<class D, class A = EmptyAccumulator>
125 {
126 public:
127 
128  // -------------------------------------------------------------------------
129  // Typedefs and Enums
130  // -------------------------------------------------------------------------
131 
132  using Data = D;
133  using Accumulator = A;
135  using Entry = typename Window::Entry;
136 
138 
152  using on_chromosome_start = std::function<void(
153  std::string const& chromosome,
154  typename Window::Accumulator& accumulator
155  )>;
156 
166  using on_chromosome_finish = std::function<void(
167  std::string const& chromosome,
168  typename Window::Accumulator& accumulator
169  )>;
170 
186  using on_enqueue = std::function<void(
187  typename Window::Entry const& entry,
188  typename Window::Accumulator& accumulator
189  )>;
190 
200  using on_dequeue = std::function<void(
201  typename Window::Entry const& entry,
202  typename Window::Accumulator& accumulator
203  )>;
204 
216  using on_emission = std::function<void(
217  Window const& window
218  )>;
219 
220  // -------------------------------------------------------------------------
221  // Constructors and Rule of Five
222  // -------------------------------------------------------------------------
223 
224  // /**
225  // * @brief Create a default (empty) instance.
226  // *
227  // * Not really important for usage, as this cannot do much. Typically, one nees a window width
228  // * and stride for doing any work. But having a default constructor comes in handy at times.
229  // */
230  // SlidingWindowGenerator() = default;
231 
240  SlidingWindowGenerator( WindowType type, size_t width, size_t stride = 0 )
241  : window_type_(type)
242  , width_(width)
243  , stride_(stride)
244  {
245  if( width == 0 ) {
246  throw std::runtime_error( "Cannot use SlidingWindowGenerator of width 0." );
247  }
248  if( stride == 0 ) {
249  stride_ = width;
250  }
251  if( stride_ > width_ ) {
252  throw std::runtime_error( "Cannot use SlidingWindowGenerator with stride > width." );
253  }
254  }
255 
264  {
266  }
267 
270 
273 
274  // -------------------------------------------------------------------------
275  // Settings
276  // -------------------------------------------------------------------------
277 
282  {
283  return window_type_;
284  }
285 
293  size_t width() const
294  {
295  return width_;
296  }
297 
305  size_t stride() const
306  {
307  return stride_;
308  }
309 
316  {
317  return emit_incomplete_windows_;
318  }
319 
327  void emit_incomplete_windows( bool value )
328  {
329  emit_incomplete_windows_ = value;
330  }
331 
336  {
337  return window_.anchor_type();
338  }
339 
344  {
345  window_.anchor_type( value );
346  }
347 
348  // -------------------------------------------------------------------------
349  // Accessors & Modifiers
350  // -------------------------------------------------------------------------
351 
358  std::string const& chromosome() const
359  {
360  // We could keep our own chromosome here, but Window already as a member for this,
361  // so we just re-use.
362  return window_.chromosome();
363  }
364 
371  bool empty() const
372  {
373  return next_index_ == 0;
374  }
375 
382  void clear()
383  {
384  current_start_ = 0;
385  next_index_ = 0;
386  window_.clear();
387  }
388 
389  // -------------------------------------------------------------------------
390  // Event Plugin Functionals
391  // -------------------------------------------------------------------------
392 
397  {
398  chromosome_start_plugins_.push_back(plugin);
399  return *this;
400  }
401 
406  {
407  chromosome_finish_plugins_.push_back(plugin);
408  return *this;
409  }
410 
415  {
416  enqueue_plugins_.push_back(plugin);
417  return *this;
418  }
419 
424  {
425  dequeue_plugins_.push_back(plugin);
426  return *this;
427  }
428 
433  {
434  emission_plugins_.push_back(plugin);
435  return *this;
436  }
437 
444  {
445  chromosome_start_plugins_.clear();
446  chromosome_finish_plugins_.clear();
447  enqueue_plugins_.clear();
448  dequeue_plugins_.clear();
449  emission_plugins_.clear();
450  }
451 
452  // -------------------------------------------------------------------------
453  // Enqueue and Generate Windows
454  // -------------------------------------------------------------------------
455 
462  void start_chromosome( std::string const& chromosome )
463  {
464  if( chromosome != window_.chromosome() ) {
466  window_.chromosome( chromosome );
467  }
468  }
469 
484  void enqueue( std::string const& chromosome, size_t position, Data const& data )
485  {
487  enqueue_( position, Data{ data });
488  }
489 
495  void enqueue( std::string const& chromosome, size_t position, Data&& data )
496  {
498  enqueue_( position, std::move( data ));
499  }
500 
512  void enqueue( size_t position, Data const& data )
513  {
514  enqueue_( position, Data{ data });
515  }
516 
522  void enqueue( size_t position, Data&& data )
523  {
524  enqueue_( position, std::move( data ));
525  }
526 
561  void finish_chromosome( size_t last_position = 0 )
562  {
563  // If nothing was enqueued yet, there is nothing to finish.
564  // This also makes sure that calling this function multiple times in a row does not
565  // have any side effects.
566  if( next_index_ == 0 ) {
567  return;
568  }
569 
570  // If we did not get a specific last position, we just finish the current interval.
571  if( last_position == 0 ) {
572  last_position = current_start_ + width_;
573  }
574 
575  // Boundary check. We make sure that the given position is neither in front of the current
576  // window, or, if there are entries in the list, also not in front of those.
577  // See enqueue_() for details. Same here.
578  size_t cur_end = window_.entries().empty() ? 0 : window_.entries().back().position;
579  cur_end = current_start_ > cur_end ? current_start_ - 1 : cur_end;
580  if( last_position <= cur_end ) {
581  throw std::runtime_error(
582  "Cannot call finish_chromosome() with position " + std::to_string(last_position) +
583  ", as the current window/chromosome is already advanced up to position " +
584  std::to_string( cur_end ) + "."
585  );
586  }
587 
588  // Emit the remaining data entries.
589  if( window_type_ == WindowType::kInterval ) {
590  synchronize_interval_( last_position );
591  assert( current_start_ <= last_position );
592  assert( last_position < current_start_ + width_ );
593 
594  // Special case for the emit_incomplete_windows_ setting. We have synchronized so that
595  // the given last_position is within the current interval. Now we need to emit that
596  // particular (incomplete) window and clean it up. The last p
597  if( emit_incomplete_windows_ ) {
598  emit_window_( current_start_, last_position + 1, last_position + 1 );
599  }
600 
601  } else if( window_type_ == WindowType::kVariants ) {
602  throw std::runtime_error( "Not yet implemented." );
603  } else {
604  assert( false );
605  }
606 
607  // Wrap up the chromosome, and clear, so that we can start a new chromosome cleanly.
608  for( auto const& chromosome_finish_plugin : chromosome_finish_plugins_ ) {
609  if( chromosome_finish_plugin ) {
610  chromosome_finish_plugin( window_.chromosome(), window_.accumulator() );
611  }
612  }
613  clear();
614  }
615 
616  // -------------------------------------------------------------------------
617  // General Internal Members
618  // -------------------------------------------------------------------------
619 
620 private:
621 
622  void enqueue_( size_t position, Data&& data )
623  {
624  // If this is the first enqueuing of the window or the chromosome,
625  // we need to call the start plugin.
626  if( next_index_ == 0 ) {
627  for( auto const& chromosome_start_plugin : chromosome_start_plugins_ ) {
628  if( chromosome_start_plugin ) {
629  chromosome_start_plugin( window_.chromosome(), window_.accumulator() );
630  }
631  }
632  }
633 
634  // Boundary check. We make sure that the given position is neither in front of the current
635  // window, or, if there are entries in the list, also not in front of those.
636  // (There might be cases were we are already in the middle of the chromosome, but the
637  // entries list is empty. Not entirely sure when this can occurr, but it feels like it can,
638  // and just checking this doesn't cost us much. If anyone wants to think this through,
639  // feel free.)
640  size_t cur_end = window_.entries().empty() ? 0 : window_.entries().back().position;
641  cur_end = current_start_ > cur_end ? current_start_ - 1 : cur_end;
642  if( position <= cur_end ) {
643  throw std::invalid_argument(
644  "Cannot enqueue at position " + std::to_string( position ) +
645  ", as the current window/chromosome is already advanced up to position " +
646  std::to_string( cur_end ) +
647  ". Either start a new window or a new chromosome within the window. "
648  "Note that this error might be caused by a VCF file that is not sorted by "
649  "chromosome and position."
650  );
651  }
652  assert( position >= current_start_ );
653  assert( window_.entries().empty() || position > window_.entries().back().position );
654 
655  // Do the correct type of enqueuing.
656  if( window_type_ == WindowType::kInterval ) {
657  enqueue_interval_( position, std::move( data ));
658  } else if( window_type_ == WindowType::kVariants ) {
659  throw std::runtime_error( "Not yet implemented." );
660  } else {
661  assert( false );
662  }
663  }
664 
665  // -------------------------------------------------------------------------
666  // Interval Internal Members
667  // -------------------------------------------------------------------------
668 
669 private:
670 
674  void enqueue_interval_( size_t position, Data&& data )
675  {
676  assert( window_type_ == WindowType::kInterval );
677 
678  // Make sure that we move to the interval where our position needs to be added to.
679  synchronize_interval_( position );
680  assert( current_start_ <= position );
681  assert( position < current_start_ + width_ );
682 
683  // Add the new data to our entry queue.
684  window_.entries().emplace_back( next_index_, position, std::move( data ));
685  ++next_index_;
686 
687  // Run the enqueue event plugins. We do not emit anything here. That will be done once
688  // the interval is finished, that is, above, when a new position outside of the interval
689  // is added (or we finish the whole iteration).
690  for( auto const& enqueue_plugin : enqueue_plugins_ ) {
691  if( enqueue_plugin ) {
692  assert( window_.entries().size() > 0 );
693  enqueue_plugin( window_.entries().back(), window_.accumulator() );
694  }
695  }
696 
697  // Make sure that all entries in the queue are within our current bounds,
698  // and are in the correct order. We use a lambda that we immediately call.
699  assert( [&](){
700  size_t cur_pos = 0;
701  for( auto it : window_.entries() ) {
702  if( it.position < current_start_ || it.position >= current_start_ + width_ ) {
703  return false;
704  }
705  if( it.position < cur_pos ) {
706  return false;
707  }
708  cur_pos = it.position;
709  }
710  return true;
711  }() );
712  }
713 
717  void synchronize_interval_( size_t position )
718  {
719  assert( window_type_ == WindowType::kInterval );
720 
721  // This function is only called internally, and only if we are sure that the position is
722  // valid. Let's assert this.
723  assert( position >= current_start_ );
724  assert( window_.entries().empty() || window_.entries().back().position < position );
725 
726  // Either there are no entries, or they are all within the current interval.
727  // That has to be the case, because we emit if we finish an interval, and remove the data.
728  // So, there should never be data that is from an old interval at this point here.
729  assert(
730  window_.entries().empty() || window_.entries().front().position >= current_start_
731  );
732  assert(
733  window_.entries().empty() || window_.entries().front().position < current_start_ + width_
734  );
735  assert(
736  window_.entries().empty() || window_.entries().back().position >= current_start_
737  );
738  assert(
739  window_.entries().empty() || window_.entries().back().position < current_start_ + width_
740  );
741 
742  // Emit the windows up to the position where we want to enqueue the new data entry.
743  // As we slide over intervals of fixed size along the genome, this can mean that we
744  // have to emit multiple (potentially empty) windows along the way, until we are at the
745  // interval that contains our new position.
746  while( current_start_ + width_ <= position ) {
747 
748  // Emit and move to next interval.
749  emit_window_( current_start_, current_start_ + width_, current_start_ + stride_ );
750  current_start_ += stride_;
751  }
752 
753  // We are now within the exact interval where we need to be.
754  assert( current_start_ <= position );
755  assert( position < current_start_ + width_ );
756  }
757 
761  void emit_window_( size_t first_position, size_t last_position, size_t dequeue_until )
762  {
763  assert( window_type_ == WindowType::kInterval );
764 
765  // Make sure that all entries in the queue are within our current bounds,
766  // and are in the correct order. We use a lambda that we immediately call.
767  assert( [&](){
768  size_t cur_pos = 0;
769  for( auto const& it : window_.entries() ) {
770  if( it.position < first_position || it.position >= last_position ) {
771  return false;
772  }
773  if( it.position < cur_pos ) {
774  return false;
775  }
776  cur_pos = it.position;
777  }
778  return true;
779  }() );
780 
781  // Prepare the window properties.
782  assert( last_position > first_position );
783  window_.first_position( first_position );
784  window_.last_position( last_position );
785 
786  // Now emit all plugin functions.
787  for( auto const& emission_plugin : emission_plugins_ ) {
788  if( emission_plugin ) {
789  emission_plugin( window_ );
790  }
791  }
792 
793  // Dequeue everything that just moved out of the current interval.
794  while(
795  window_.entries().size() > 0 && window_.entries().front().position < dequeue_until
796  ) {
797  for( auto const& dequeue_plugin : dequeue_plugins_ ) {
798  if( dequeue_plugin ) {
799  dequeue_plugin( window_.entries().front(), window_.accumulator() );
800  }
801  }
802  window_.entries().pop_front();
803  }
804  }
805 
806  // -------------------------------------------------------------------------
807  // Data Members
808  // -------------------------------------------------------------------------
809 
810 private:
811 
812  // Fixed settings
813  WindowType window_type_;
814  size_t width_;
815  size_t stride_;
816 
817  // Other settings
818  bool emit_incomplete_windows_ = false;
819 
820  // Current window and its position
821  size_t current_start_ = 0;
822  size_t next_index_ = 0;
823  Window window_;
824 
825  // Plugin functions
826  std::vector<on_chromosome_start> chromosome_start_plugins_;
827  std::vector<on_chromosome_finish> chromosome_finish_plugins_;
828  std::vector<on_enqueue> enqueue_plugins_;
829  std::vector<on_dequeue> dequeue_plugins_;
830  std::vector<on_emission> emission_plugins_;
831 
832 };
833 
834 } // namespace population
835 } // namespace genesis
836 
837 #endif // include guard
genesis::population::Window::accumulator
Accumulator & accumulator()
Get the Accumulator data that can be used for speeding up certain window computations.
Definition: window.hpp:553
genesis::population::SlidingWindowGenerator::emit_incomplete_windows
bool emit_incomplete_windows() const
Get whether the last (incomplete) window is also emitted.
Definition: sliding_window_generator.hpp:315
genesis::population::SlidingWindowGenerator::width
size_t width() const
Get the non-mutable width of this SlidingWindowGenerator.
Definition: sliding_window_generator.hpp:293
genesis::population::SlidingWindowGenerator::operator=
SlidingWindowGenerator & operator=(SlidingWindowGenerator const &)=default
genesis::population::SlidingWindowGenerator< double >::on_chromosome_start
std::function< void(std::string const &chromosome, typename Window::Accumulator &accumulator)> on_chromosome_start
Plugin functions that are called on the first enqueue() of a newly started chromosome.
Definition: sliding_window_generator.hpp:155
genesis::population::SlidingWindowGenerator::add_emission_plugin
self_type & add_emission_plugin(on_emission const &plugin)
Add an on_emission plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:432
genesis::population::SlidingWindowGenerator::clear_plugins
void clear_plugins()
Clear all plugin functions.
Definition: sliding_window_generator.hpp:443
genesis::population::SlidingWindowGenerator< double >::on_emission
std::function< void(Window const &window)> on_emission
Main plugin functions that are called for every window.
Definition: sliding_window_generator.hpp:218
genesis::population::SlidingWindowGenerator::add_enqueue_plugin
self_type & add_enqueue_plugin(on_enqueue const &plugin)
Add an on_enqueue plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:414
genesis::population::Window< double, EmptyAccumulator >
genesis::population::SlidingWindowGenerator::anchor_type
WindowAnchorType anchor_type() const
Get the WindowAnchorType that we use for the emitted Windows.
Definition: sliding_window_generator.hpp:335
genesis::population::SlidingWindowGenerator::SlidingWindowGenerator
SlidingWindowGenerator(WindowType type, size_t width, size_t stride=0)
Construct a SlidingWindowGenerator, given the WindowType and width, and potentially stride.
Definition: sliding_window_generator.hpp:240
genesis::population::SlidingWindowGenerator< double >::on_dequeue
std::function< void(typename Window::Entry const &entry, typename Window::Accumulator &accumulator)> on_dequeue
Plugin functions to update the Accumulator when Data is removed due to the window moving away from it...
Definition: sliding_window_generator.hpp:203
genesis::population::WindowType::kVariants
@ kVariants
genesis::population::SlidingWindowGenerator::chromosome
std::string const & chromosome() const
Get the chromosome name that we are currently processing.
Definition: sliding_window_generator.hpp:358
genesis::population::SlidingWindowGenerator::finish_chromosome
void finish_chromosome(size_t last_position=0)
Explicitly finish a chromosome, and emit all remaining Windows.
Definition: sliding_window_generator.hpp:561
genesis::population::SlidingWindowGenerator::clear
void clear()
Clear all data of the Window.
Definition: sliding_window_generator.hpp:382
genesis::population::SlidingWindowGenerator< double >::on_enqueue
std::function< void(typename Window::Entry const &entry, typename Window::Accumulator &accumulator)> on_enqueue
Plugin functions to update the Accumulator when new Data is enqueued.
Definition: sliding_window_generator.hpp:189
genesis::population::SlidingWindowGenerator::emit_incomplete_windows
void emit_incomplete_windows(bool value)
Set whether the last (incomplete) window is also emitted.
Definition: sliding_window_generator.hpp:327
genesis::population::WindowType::kInterval
@ kInterval
genesis::population::Window::entries
container const & entries() const
Immediate container access to the Data Entries.
Definition: window.hpp:537
genesis::population::EmptyAccumulator
Empty helper data struct to serve as a dummy for Window.
Definition: window.hpp:88
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(size_t position, Data const &data)
Enqueue a new Data value, without considering its chromosome.
Definition: sliding_window_generator.hpp:512
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::Window::Entry
Data that is stored per entry that was enqueued in a window.
Definition: window.hpp:127
genesis::population::SlidingWindowGenerator::add_chromosome_start_plugin
self_type & add_chromosome_start_plugin(on_chromosome_start const &plugin)
Add an on_chromosome_start plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:396
genesis::population::SlidingWindowGenerator::empty
bool empty() const
Return whether the instance is empty.
Definition: sliding_window_generator.hpp:371
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(std::string const &chromosome, size_t position, Data &&data)
Enqueue a new Data value, by moving it.
Definition: sliding_window_generator.hpp:495
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(size_t position, Data &&data)
Enqueue a new Data value by moving it, without considering its chromosome.
Definition: sliding_window_generator.hpp:522
genesis::population::SlidingWindowGenerator::~SlidingWindowGenerator
~SlidingWindowGenerator()
Destruct the instance.
Definition: sliding_window_generator.hpp:263
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(std::string const &chromosome, size_t position, Data const &data)
Enqueue a new Data value.
Definition: sliding_window_generator.hpp:484
window.hpp
genesis::population::SlidingWindowGenerator< double >::Data
double Data
Definition: sliding_window_generator.hpp:132
genesis::population::Window::anchor_type
WindowAnchorType anchor_type() const
Get the WindowAnchorType that is currently set for using anchor_position().
Definition: window.hpp:444
genesis::population::SlidingWindowGenerator::add_dequeue_plugin
self_type & add_dequeue_plugin(on_dequeue const &plugin)
Add an on_dequeue plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:423
genesis::population::to_string
std::string to_string(GenomeRegion const &region)
Definition: functions/genome_region.cpp:55
genesis::population::SlidingWindowGenerator::add_chromosome_finish_plugin
self_type & add_chromosome_finish_plugin(on_chromosome_finish const &plugin)
Add an on_chromosome_finish plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:405
genesis::population::SlidingWindowGenerator::anchor_type
void anchor_type(WindowAnchorType value)
Set the WindowAnchorType that we use for the emitted Windows.
Definition: sliding_window_generator.hpp:343
genesis::population::Window::clear
void clear()
Clear all data from the Window.
Definition: window.hpp:574
genesis::population::SlidingWindowGenerator::start_chromosome
void start_chromosome(std::string const &chromosome)
Signal the start of a new chromosome, given its name.
Definition: sliding_window_generator.hpp:462
genesis::population::WindowType
WindowType
WindowType of a Window, that is, whether we slide along a fixed size interval of the genome,...
Definition: window.hpp:51
genesis::population::WindowAnchorType
WindowAnchorType
Position in the genome that is used for reporting when emitting or using a window.
Definition: window.hpp:69
genesis::population::Window::chromosome
std::string const & chromosome() const
Get the chromosome name that this Window belongs to.
Definition: window.hpp:222
genesis::population::SlidingWindowGenerator::stride
size_t stride() const
Get the non-mutable stride of this SlidingWindowGenerator.
Definition: sliding_window_generator.hpp:305
genesis::population::SlidingWindowGenerator::window_type
WindowType window_type() const
Get the non-mutable WindowType of this SlidingWindowGenerator.
Definition: sliding_window_generator.hpp:281
genesis::population::SlidingWindowGenerator
Generator for sliding Windows over the chromosomes of a genome.
Definition: sliding_window_generator.hpp:124
genesis::population::SlidingWindowGenerator< double >::Entry
typename Window::Entry Entry
Definition: sliding_window_generator.hpp:135
genesis::population::SlidingWindowGenerator< double >::on_chromosome_finish
std::function< void(std::string const &chromosome, typename Window::Accumulator &accumulator)> on_chromosome_finish
Plugin functions that are called when finishing a chromosome.
Definition: sliding_window_generator.hpp:169