A library for working with phylogenetic and population genetic data.
v0.27.0
sliding_window_generator.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_WINDOW_SLIDING_WINDOW_GENERATOR_H_
2 #define GENESIS_POPULATION_WINDOW_SLIDING_WINDOW_GENERATOR_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2022 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
34 #include <cassert>
35 #include <deque>
36 #include <functional>
37 #include <stdexcept>
38 #include <string>
39 #include <vector>
40 
43 
44 namespace genesis {
45 namespace population {
46 
47 // =================================================================================================
48 // Sliding Window Type
49 // =================================================================================================
50 
56 {
63  kInterval,
64 
69  kVariants,
70 
79 };
80 
81 // =================================================================================================
82 // Genomic Sliding Window Generator
83 // =================================================================================================
84 
164 template<class D, class A = EmptyAccumulator>
166 {
167 public:
168 
169  // -------------------------------------------------------------------------
170  // Typedefs and Enums
171  // -------------------------------------------------------------------------
172 
173  using Data = D;
174  using Accumulator = A;
176  using Entry = typename Window::Entry;
177 
179 
193  using on_chromosome_start = std::function<void(
194  std::string const& chromosome,
195  typename Window::Accumulator& accumulator
196  )>;
197 
207  using on_chromosome_finish = std::function<void(
208  std::string const& chromosome,
209  typename Window::Accumulator& accumulator
210  )>;
211 
227  using on_enqueue = std::function<void(
228  typename Window::Entry const& entry,
229  typename Window::Accumulator& accumulator
230  )>;
231 
241  using on_dequeue = std::function<void(
242  typename Window::Entry const& entry,
243  typename Window::Accumulator& accumulator
244  )>;
245 
257  using on_emission = std::function<void(
258  Window const& window
259  )>;
260 
261  // -------------------------------------------------------------------------
262  // Constructors and Rule of Five
263  // -------------------------------------------------------------------------
264 
265  // /**
266  // * @brief Create a default (empty) instance.
267  // *
268  // * Not really important for usage, as this cannot do much. Typically, one nees a window width
269  // * and stride for doing any work. But having a default constructor comes in handy at times.
270  // */
271  // SlidingWindowGenerator() = default;
272 
282  : window_type_(type)
283  , width_(width)
284  , stride_(stride)
285  {
286  if( width == 0 ) {
287  throw std::runtime_error( "Cannot use SlidingWindowGenerator of width 0." );
288  }
289  if( stride == 0 ) {
290  stride_ = width;
291  }
292  if( stride_ > width_ ) {
293  throw std::runtime_error( "Cannot use SlidingWindowGenerator with stride > width." );
294  }
295  }
296 
305  {
307  }
308 
311 
314 
315  // -------------------------------------------------------------------------
316  // Settings
317  // -------------------------------------------------------------------------
318 
323  {
324  return window_type_;
325  }
326 
334  size_t width() const
335  {
336  return width_;
337  }
338 
347  size_t stride() const
348  {
349  return stride_;
350  }
351 
358  {
359  return emit_incomplete_windows_;
360  }
361 
369  void emit_incomplete_windows( bool value )
370  {
371  emit_incomplete_windows_ = value;
372  }
373 
374  // /* *
375  // * @brief Get the WindowAnchorType that we use for the emitted Window%s.
376  // */
377  // WindowAnchorType anchor_type() const
378  // {
379  // return window_.anchor_type();
380  // }
381  //
382  // /* *
383  // * @brief Set the WindowAnchorType that we use for the emitted Window%s.
384  // */
385  // void anchor_type( WindowAnchorType value )
386  // {
387  // window_.anchor_type( value );
388  // }
389 
390  // -------------------------------------------------------------------------
391  // Accessors & Modifiers
392  // -------------------------------------------------------------------------
393 
400  std::string const& chromosome() const
401  {
402  // We could keep our own chromosome here, but Window already as a member for this,
403  // so we just re-use.
404  return window_.chromosome();
405  }
406 
413  bool empty() const
414  {
415  return next_index_ == 0;
416  }
417 
424  void clear()
425  {
426  current_start_ = 0;
427  next_index_ = 0;
428  window_.clear();
429  }
430 
431  // -------------------------------------------------------------------------
432  // Event Plugin Functionals
433  // -------------------------------------------------------------------------
434 
439  {
440  chromosome_start_plugins_.push_back(plugin);
441  return *this;
442  }
443 
448  {
449  chromosome_finish_plugins_.push_back(plugin);
450  return *this;
451  }
452 
457  {
458  enqueue_plugins_.push_back(plugin);
459  return *this;
460  }
461 
466  {
467  dequeue_plugins_.push_back(plugin);
468  return *this;
469  }
470 
475  {
476  emission_plugins_.push_back(plugin);
477  return *this;
478  }
479 
486  {
487  chromosome_start_plugins_.clear();
488  chromosome_finish_plugins_.clear();
489  enqueue_plugins_.clear();
490  dequeue_plugins_.clear();
491  emission_plugins_.clear();
492  }
493 
494  // -------------------------------------------------------------------------
495  // Enqueue and Generate Windows
496  // -------------------------------------------------------------------------
497 
504  void start_chromosome( std::string const& chromosome )
505  {
506  if( chromosome != window_.chromosome() ) {
508  window_.chromosome( chromosome );
509  }
510  }
511 
526  void enqueue( std::string const& chromosome, size_t position, Data const& data )
527  {
529  enqueue_( position, Data{ data });
530  }
531 
537  void enqueue( std::string const& chromosome, size_t position, Data&& data )
538  {
540  enqueue_( position, std::move( data ));
541  }
542 
554  void enqueue( size_t position, Data const& data )
555  {
556  enqueue_( position, Data{ data });
557  }
558 
564  void enqueue( size_t position, Data&& data )
565  {
566  enqueue_( position, std::move( data ));
567  }
568 
603  void finish_chromosome( size_t last_position = 0 )
604  {
605  // If nothing was enqueued yet, there is nothing to finish.
606  // This also makes sure that calling this function multiple times in a row does not
607  // have any side effects.
608  if( next_index_ == 0 ) {
609  return;
610  }
611 
612  // If we did not get a specific last position, we just finish the current interval.
613  if( last_position == 0 ) {
614  last_position = current_start_ + width_;
615  }
616 
617  // Boundary check. We make sure that the given position is neither in front of the current
618  // window, or, if there are entries in the list, also not in front of those.
619  // See enqueue_() for details. Same here.
620  size_t cur_end = window_.entries().empty() ? 0 : window_.entries().back().position;
621  cur_end = current_start_ > cur_end ? current_start_ - 1 : cur_end;
622  if( last_position <= cur_end ) {
623  throw std::runtime_error(
624  "Cannot call finish_chromosome() with position " + std::to_string(last_position) +
625  ", as the current window/chromosome is already advanced up to position " +
626  std::to_string( cur_end ) + "."
627  );
628  }
629 
630  // Emit the remaining data entries.
631  if( window_type_ == SlidingWindowType::kInterval ) {
632  synchronize_interval_( last_position );
633  assert( current_start_ <= last_position );
634  assert( last_position < current_start_ + width_ );
635 
636  // Special case for the emit_incomplete_windows_ setting. We have synchronized so that
637  // the given last_position is within the current interval. Now we need to emit that
638  // particular (incomplete) window and clean it up. The last p
639  if( emit_incomplete_windows_ ) {
640  emit_window_( current_start_, last_position + 1, last_position + 1 );
641  }
642 
643  } else if( window_type_ == SlidingWindowType::kVariants ) {
644  throw std::runtime_error( "Not yet implemented." );
645  } else {
646  assert( false );
647  }
648 
649  // Wrap up the chromosome, and clear, so that we can start a new chromosome cleanly.
650  for( auto const& chromosome_finish_plugin : chromosome_finish_plugins_ ) {
651  if( chromosome_finish_plugin ) {
652  chromosome_finish_plugin( window_.chromosome(), window_.accumulator() );
653  }
654  }
655  clear();
656  }
657 
658  // -------------------------------------------------------------------------
659  // General Internal Members
660  // -------------------------------------------------------------------------
661 
662 private:
663 
664  void enqueue_( size_t position, Data&& data )
665  {
666  // If this is the first enqueuing of the window or the chromosome,
667  // we need to call the start plugin.
668  if( next_index_ == 0 ) {
669  for( auto const& chromosome_start_plugin : chromosome_start_plugins_ ) {
670  if( chromosome_start_plugin ) {
671  chromosome_start_plugin( window_.chromosome(), window_.accumulator() );
672  }
673  }
674  }
675 
676  // Boundary check. We make sure that the given position is neither in front of the current
677  // window, or, if there are entries in the list, also not in front of those.
678  // (There might be cases were we are already in the middle of the chromosome, but the
679  // entries list is empty. Not entirely sure when this can occurr, but it feels like it can,
680  // and just checking this doesn't cost us much. If anyone wants to think this through,
681  // feel free.)
682  size_t cur_end = window_.entries().empty() ? 0 : window_.entries().back().position;
683  cur_end = current_start_ > cur_end ? current_start_ - 1 : cur_end;
684  if( position <= cur_end ) {
685  throw std::invalid_argument(
686  "Cannot enqueue at position " + std::to_string( position ) +
687  ", as the current window/chromosome is already advanced up to position " +
688  std::to_string( cur_end ) +
689  ". Either start a new window or a new chromosome within the window. "
690  "Note that this error might be caused by a VCF file that is not sorted by "
691  "chromosome and position."
692  );
693  }
694  assert( position >= current_start_ );
695  assert( window_.entries().empty() || position > window_.entries().back().position );
696 
697  // Do the correct type of enqueuing.
698  if( window_type_ == SlidingWindowType::kInterval ) {
699  enqueue_interval_( position, std::move( data ));
700  } else if( window_type_ == SlidingWindowType::kVariants ) {
701  throw std::runtime_error( "Not yet implemented." );
702  } else {
703  assert( false );
704  }
705  }
706 
707  // -------------------------------------------------------------------------
708  // Interval Internal Members
709  // -------------------------------------------------------------------------
710 
711 private:
712 
716  void enqueue_interval_( size_t position, Data&& data )
717  {
718  assert( window_type_ == SlidingWindowType::kInterval );
719 
720  // Make sure that we move to the interval where our position needs to be added to.
721  synchronize_interval_( position );
722  assert( current_start_ <= position );
723  assert( position < current_start_ + width_ );
724 
725  // Add the new data to our entry queue.
726  window_.entries().emplace_back( next_index_, position, std::move( data ));
727  ++next_index_;
728 
729  // Run the enqueue event plugins. We do not emit anything here. That will be done once
730  // the interval is finished, that is, above, when a new position outside of the interval
731  // is added (or we finish the whole iteration).
732  for( auto const& enqueue_plugin : enqueue_plugins_ ) {
733  if( enqueue_plugin ) {
734  assert( window_.entries().size() > 0 );
735  enqueue_plugin( window_.entries().back(), window_.accumulator() );
736  }
737  }
738 
739  // Make sure that all entries in the queue are within our current bounds,
740  // and are in the correct order. We use a lambda that we immediately call.
741  assert( [&](){
742  size_t cur_pos = 0;
743  for( auto it : window_.entries() ) {
744  if( it.position < current_start_ || it.position >= current_start_ + width_ ) {
745  return false;
746  }
747  if( it.position < cur_pos ) {
748  return false;
749  }
750  cur_pos = it.position;
751  }
752  return true;
753  }() );
754  }
755 
759  void synchronize_interval_( size_t position )
760  {
761  assert( window_type_ == SlidingWindowType::kInterval );
762 
763  // This function is only called internally, and only if we are sure that the position is
764  // valid. Let's assert this.
765  assert( position >= current_start_ );
766  assert( window_.entries().empty() || window_.entries().back().position < position );
767 
768  // Either there are no entries, or they are all within the current interval.
769  // That has to be the case, because we emit if we finish an interval, and remove the data.
770  // So, there should never be data that is from an old interval at this point here.
771  assert(
772  window_.entries().empty() || window_.entries().front().position >= current_start_
773  );
774  assert(
775  window_.entries().empty() || window_.entries().front().position < current_start_ + width_
776  );
777  assert(
778  window_.entries().empty() || window_.entries().back().position >= current_start_
779  );
780  assert(
781  window_.entries().empty() || window_.entries().back().position < current_start_ + width_
782  );
783 
784  // Emit the windows up to the position where we want to enqueue the new data entry.
785  // As we slide over intervals of fixed size along the genome, this can mean that we
786  // have to emit multiple (potentially empty) windows along the way, until we are at the
787  // interval that contains our new position.
788  while( current_start_ + width_ <= position ) {
789 
790  // Emit and move to next interval.
791  emit_window_( current_start_, current_start_ + width_, current_start_ + stride_ );
792  current_start_ += stride_;
793  }
794 
795  // We are now within the exact interval where we need to be.
796  assert( current_start_ <= position );
797  assert( position < current_start_ + width_ );
798  }
799 
803  void emit_window_( size_t first_position, size_t last_position, size_t dequeue_until )
804  {
805  assert( window_type_ == SlidingWindowType::kInterval );
806 
807  // Make sure that all entries in the queue are within our current bounds,
808  // and are in the correct order. We use a lambda that we immediately call.
809  assert( [&](){
810  size_t cur_pos = 0;
811  for( auto const& it : window_.entries() ) {
812  if( it.position < first_position || it.position >= last_position ) {
813  return false;
814  }
815  if( it.position < cur_pos ) {
816  return false;
817  }
818  cur_pos = it.position;
819  }
820  return true;
821  }() );
822 
823  // Prepare the window properties.
824  assert( last_position > first_position );
825  window_.first_position( first_position );
826  window_.last_position( last_position );
827 
828  // Now emit all plugin functions.
829  for( auto const& emission_plugin : emission_plugins_ ) {
830  if( emission_plugin ) {
831  emission_plugin( window_ );
832  }
833  }
834 
835  // Dequeue everything that just moved out of the current interval.
836  while(
837  window_.entries().size() > 0 && window_.entries().front().position < dequeue_until
838  ) {
839  for( auto const& dequeue_plugin : dequeue_plugins_ ) {
840  if( dequeue_plugin ) {
841  dequeue_plugin( window_.entries().front(), window_.accumulator() );
842  }
843  }
844  window_.entries().pop_front();
845  }
846  }
847 
848  // -------------------------------------------------------------------------
849  // Data Members
850  // -------------------------------------------------------------------------
851 
852 private:
853 
854  // Fixed settings
855  SlidingWindowType window_type_;
856  size_t width_;
857  size_t stride_;
858 
859  // Other settings
860  bool emit_incomplete_windows_ = false;
861 
862  // Current window and its position
863  size_t current_start_ = 0;
864  size_t next_index_ = 0;
865  Window window_;
866 
867  // Plugin functions
868  std::vector<on_chromosome_start> chromosome_start_plugins_;
869  std::vector<on_chromosome_finish> chromosome_finish_plugins_;
870  std::vector<on_enqueue> enqueue_plugins_;
871  std::vector<on_dequeue> dequeue_plugins_;
872  std::vector<on_emission> emission_plugins_;
873 
874 };
875 
876 } // namespace population
877 } // namespace genesis
878 
879 #endif // include guard
genesis::population::Window::accumulator
Accumulator & accumulator()
Get the Accumulator data that can be used for speeding up certain window computations.
Definition: window.hpp:477
genesis::population::SlidingWindowGenerator::emit_incomplete_windows
bool emit_incomplete_windows() const
Get whether the last (incomplete) window is also emitted.
Definition: sliding_window_generator.hpp:357
genesis::population::SlidingWindowGenerator::width
size_t width() const
Get the non-mutable width of this SlidingWindowGenerator.
Definition: sliding_window_generator.hpp:334
genesis::population::SlidingWindowGenerator::operator=
SlidingWindowGenerator & operator=(SlidingWindowGenerator const &)=default
genesis::population::SlidingWindowGenerator< double >::on_chromosome_start
std::function< void(std::string const &chromosome, typename Window::Accumulator &accumulator)> on_chromosome_start
Plugin functions that are called on the first enqueue() of a newly started chromosome.
Definition: sliding_window_generator.hpp:196
genesis::population::SlidingWindowGenerator::add_emission_plugin
self_type & add_emission_plugin(on_emission const &plugin)
Add an on_emission plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:474
genesis::population::SlidingWindowGenerator::clear_plugins
void clear_plugins()
Clear all plugin functions.
Definition: sliding_window_generator.hpp:485
genesis::population::SlidingWindowGenerator< double >::on_emission
std::function< void(Window const &window)> on_emission
Main plugin functions that are called for every window.
Definition: sliding_window_generator.hpp:259
genesis::population::SlidingWindowGenerator::add_enqueue_plugin
self_type & add_enqueue_plugin(on_enqueue const &plugin)
Add an on_enqueue plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:456
genesis::population::Window< double, EmptyAccumulator >
genesis::population::SlidingWindowGenerator< double >::on_dequeue
std::function< void(typename Window::Entry const &entry, typename Window::Accumulator &accumulator)> on_dequeue
Plugin functions to update the Accumulator when Data is removed due to the window moving away from it...
Definition: sliding_window_generator.hpp:244
genesis::population::SlidingWindowGenerator::chromosome
std::string const & chromosome() const
Get the chromosome name that we are currently processing.
Definition: sliding_window_generator.hpp:400
genesis::population::SlidingWindowType::kChromosome
@ kChromosome
Windows of this type contain positions across a whole chromosome.
genesis::population::SlidingWindowGenerator::finish_chromosome
void finish_chromosome(size_t last_position=0)
Explicitly finish a chromosome, and emit all remaining Windows.
Definition: sliding_window_generator.hpp:603
genesis::population::SlidingWindowType::kVariants
@ kVariants
Windows of this type are defined as containing a fixed number of entries (usually,...
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
functions.hpp
genesis::population::SlidingWindowGenerator::clear
void clear()
Clear all data of the Window.
Definition: sliding_window_generator.hpp:424
genesis::population::SlidingWindowGenerator< double >::on_enqueue
std::function< void(typename Window::Entry const &entry, typename Window::Accumulator &accumulator)> on_enqueue
Plugin functions to update the Accumulator when new Data is enqueued.
Definition: sliding_window_generator.hpp:230
genesis::population::SlidingWindowGenerator::emit_incomplete_windows
void emit_incomplete_windows(bool value)
Set whether the last (incomplete) window is also emitted.
Definition: sliding_window_generator.hpp:369
genesis::population::Window::entries
container const & entries() const
Immediate container access to the Data Entries.
Definition: window.hpp:461
genesis::population::SlidingWindowType
SlidingWindowType
SlidingWindowType of a Window, that is, whether we slide along a fixed size interval of the genome,...
Definition: sliding_window_generator.hpp:55
genesis::population::EmptyAccumulator
Empty helper data struct to serve as a dummy for Window.
Definition: window.hpp:56
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(size_t position, Data const &data)
Enqueue a new Data value, without considering its chromosome.
Definition: sliding_window_generator.hpp:554
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::Window::Entry
Data that is stored per entry that was enqueued in a window.
Definition: window.hpp:124
genesis::population::SlidingWindowGenerator::add_chromosome_start_plugin
self_type & add_chromosome_start_plugin(on_chromosome_start const &plugin)
Add an on_chromosome_start plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:438
genesis::population::SlidingWindowGenerator::empty
bool empty() const
Return whether the instance is empty.
Definition: sliding_window_generator.hpp:413
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(std::string const &chromosome, size_t position, Data &&data)
Enqueue a new Data value, by moving it.
Definition: sliding_window_generator.hpp:537
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(size_t position, Data &&data)
Enqueue a new Data value by moving it, without considering its chromosome.
Definition: sliding_window_generator.hpp:564
genesis::population::SlidingWindowGenerator::~SlidingWindowGenerator
~SlidingWindowGenerator()
Destruct the instance.
Definition: sliding_window_generator.hpp:304
genesis::population::SlidingWindowGenerator::SlidingWindowGenerator
SlidingWindowGenerator(SlidingWindowType type, size_t width, size_t stride=0)
Construct a SlidingWindowGenerator, given the SlidingWindowType and width, and potentially stride.
Definition: sliding_window_generator.hpp:281
genesis::population::SlidingWindowGenerator::enqueue
void enqueue(std::string const &chromosome, size_t position, Data const &data)
Enqueue a new Data value.
Definition: sliding_window_generator.hpp:526
window.hpp
genesis::population::SlidingWindowGenerator< double >::Data
double Data
Definition: sliding_window_generator.hpp:173
genesis::population::SlidingWindowType::kInterval
@ kInterval
Windows of this type are defined by a fixed start and end position on a chromosome.
genesis::population::SlidingWindowGenerator::window_type
SlidingWindowType window_type() const
Get the non-mutable SlidingWindowType of this SlidingWindowGenerator.
Definition: sliding_window_generator.hpp:322
genesis::population::SlidingWindowGenerator::add_dequeue_plugin
self_type & add_dequeue_plugin(on_dequeue const &plugin)
Add an on_dequeue plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:465
genesis::population::SlidingWindowGenerator::add_chromosome_finish_plugin
self_type & add_chromosome_finish_plugin(on_chromosome_finish const &plugin)
Add an on_chromosome_finish plugin function, typically a lambda.
Definition: sliding_window_generator.hpp:447
genesis::population::Window::clear
void clear()
Clear all data from the Window.
Definition: window.hpp:526
genesis::population::SlidingWindowGenerator::start_chromosome
void start_chromosome(std::string const &chromosome)
Signal the start of a new chromosome, given its name.
Definition: sliding_window_generator.hpp:504
genesis::population::Window::chromosome
std::string const & chromosome() const
Get the chromosome name that this Window belongs to.
Definition: window.hpp:224
genesis::population::SlidingWindowGenerator::stride
size_t stride() const
Get the non-mutable stride of this SlidingWindowGenerator.
Definition: sliding_window_generator.hpp:347
genesis::population::SlidingWindowGenerator
Generator for sliding Windows over the chromosomes of a genome.
Definition: sliding_window_generator.hpp:165
genesis::population::SlidingWindowGenerator< double >::Entry
typename Window::Entry Entry
Definition: sliding_window_generator.hpp:176
genesis::population::SlidingWindowGenerator< double >::on_chromosome_finish
std::function< void(std::string const &chromosome, typename Window::Accumulator &accumulator)> on_chromosome_finish
Plugin functions that are called when finishing a chromosome.
Definition: sliding_window_generator.hpp:210