A library for working with phylogenetic and population genetic data.
v0.32.0
diversity_pool_processor.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTION_DIVERSITY_POOL_PROCESSOR_H_
2 #define GENESIS_POPULATION_FUNCTION_DIVERSITY_POOL_PROCESSOR_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
44 
45 #include <algorithm>
46 #include <cassert>
47 #include <memory>
48 #include <stdexcept>
49 #include <string>
50 #include <utility>
51 #include <vector>
52 
53 namespace genesis {
54 namespace population {
55 
56 // =================================================================================================
57 // Diversity Pool Processor
58 // =================================================================================================
59 
68 {
69 public:
70 
71  // -------------------------------------------------------------------------
72  // Constructors and Rule of Five
73  // -------------------------------------------------------------------------
74 
83  DiversityPoolProcessor() = default;
84 
94  WindowAveragePolicy window_average_policy,
95  std::shared_ptr<utils::ThreadPool> thread_pool = nullptr,
96  size_t threading_threshold = 4096
97  )
98  : avg_policy_( window_average_policy )
99  , thread_pool_( thread_pool )
100  , threading_threshold_( threading_threshold )
101  , is_default_constructed_( false )
102  {
104  }
105 
106  ~DiversityPoolProcessor() = default;
107 
110 
113 
114  // -------------------------------------------------------------------------
115  // Setup
116  // -------------------------------------------------------------------------
117 
121  std::shared_ptr<utils::ThreadPool> thread_pool() const
122  {
123  return thread_pool_;
124  }
125 
132  DiversityPoolProcessor& thread_pool( std::shared_ptr<utils::ThreadPool> value )
133  {
134  thread_pool_ = value;
135  return *this;
136  }
137 
138  size_t threading_threshold() const
139  {
140  return threading_threshold_;
141  }
142 
153  {
154  threading_threshold_ = value;
155  return *this;
156  }
157 
166  DiversityPoolSettings const& settings,
167  std::vector<size_t> const& pool_sizes
168  ) {
169  if( ! calculators_.empty() ) {
170  throw std::runtime_error(
171  "Cannot call DiversityPoolProcessor::add_calculators() multiple times."
172  );
173  }
174  for( auto pool_size : pool_sizes ) {
175  calculators_.push_back( DiversityPoolCalculator( settings, pool_size ));
176  results_.emplace_back();
177  }
178  }
179 
180  // -------------------------------------------------------------------------
181  // Calculator Functions
182  // -------------------------------------------------------------------------
183 
184  size_t size() const
185  {
186  return calculators_.size();
187  }
188 
189  void reset()
190  {
191  assert( results_.size() == calculators_.size() );
192  for( size_t i = 0; i < results_.size(); ++i ) {
193  calculators_[i].reset();
194  results_[i] = DiversityPoolCalculator::Result();
195  }
196  filter_stats_.clear();
197  }
198 
199  void process( Variant const& variant )
200  {
201  // Check correct usage
202  if( is_default_constructed_ ) {
203  throw std::domain_error( "Cannot use a default constructed FstPoolProcessor" );
204  }
205 
206  // Boundary error check. We do this before any other processing of the Variant,
207  // as this indicates a serious error or issue with the data somewhere,
208  // which we want to catch in any case.
209  if( variant.samples.size() != calculators_.size() ) {
210  throw std::runtime_error(
211  "Invalid number of samples when computing Diversity Pool: Variant contains " +
212  std::to_string( variant.samples.size() ) + " samples, but " +
213  std::to_string( calculators_.size() ) + " pool sizes have been provided."
214  );
215  }
216 
217  // Only process Variants that are passing, but keep track of the ones that did not.
218  ++filter_stats_[variant.status.get()];
219  if( ! variant.status.passing() ) {
220  return;
221  }
222 
223  // Helper function for the processing.
224  auto process_ = [&]( size_t index ) {
225  assert( index < calculators_.size() );
226  calculators_[index].process( variant.samples[index] );
227  };
228 
229  // Switch dynamically between threading and no threading for the processing.
230  assert( variant.samples.size() == calculators_.size() );
231  if( thread_pool_ && calculators_.size() >= threading_threshold_ ) {
232  parallel_for(
233  0, variant.samples.size(), process_, thread_pool_
234  );
235  } else {
236  for( size_t i = 0; i < variant.samples.size(); ++i ) {
237  process_(i);
238  }
239  }
240  }
241 
251  template<class D>
252  std::vector<DiversityPoolCalculator::Result> const& get_result(
253  BaseWindow<D> const& window,
254  std::shared_ptr<GenomeLocusSet> provided_loci
255  ) const {
256  assert( results_.size() == calculators_.size() );
257  for( size_t i = 0; i < results_.size(); ++i ) {
258  auto const window_avg_denom = window_average_denominator(
259  avg_policy_, window, provided_loci, filter_stats_, calculators_[i].get_filter_stats()
260  );
261  results_[i] = calculators_[i].get_result( window_avg_denom );
262  }
263  return results_;
264  }
265 
272  std::vector<DiversityPoolCalculator::Result> const& get_result() const
273  {
274  assert( results_.size() == calculators_.size() );
275  for( size_t i = 0; i < results_.size(); ++i ) {
276  results_[i] = calculators_[i].get_result( 1 );
277  }
278  return results_;
279  }
280 
289  {
290  return filter_stats_;
291  }
292 
293  std::vector<DiversityPoolCalculator> const& calculators() const
294  {
295  return calculators_;
296  }
297 
298  // -------------------------------------------------------------------------
299  // Calculator Iterator
300  // -------------------------------------------------------------------------
301 
302  std::vector<DiversityPoolCalculator>::iterator begin()
303  {
304  return calculators_.begin();
305  }
306 
307  std::vector<DiversityPoolCalculator>::iterator end()
308  {
309  return calculators_.end();
310  }
311 
312  // -------------------------------------------------------------------------
313  // Internal Members
314  // -------------------------------------------------------------------------
315 
316 private:
317 
318  // We force the correct usage of the window averaging policy here,
319  // so that we make misinterpretation of the values less likely.
320  WindowAveragePolicy avg_policy_;
321 
322  // Orocessors to use for these computations, which keep all the data they need.
323  std::vector<DiversityPoolCalculator> calculators_;
324 
325  // Count how many Variants were processed in this processor,
326  // and how many of them passed or failed the filters.
327  VariantFilterStats filter_stats_;
328 
329  // We keep a mutable cache for the results, to avoid reallocating memory each time.
330  mutable std::vector<DiversityPoolCalculator::Result> results_;
331 
332  // Thread pool to run the buffering in the background, and the size (number of sample pairs)
333  // at which we start using the thread pool.
334  std::shared_ptr<utils::ThreadPool> thread_pool_;
335  size_t threading_threshold_ = 0;
336 
337  // We want to make sure to disallow default constructed instances.
338  // Bit ugly to do it this way, but works for now.
339  bool is_default_constructed_ = true;
340 };
341 
342 // =================================================================================================
343 // Make Diversity Processor Helper Functions
344 // =================================================================================================
345 
357  WindowAveragePolicy window_average_policy,
358  DiversityPoolSettings const& settings,
359  std::vector<size_t> const& pool_sizes
360 ) {
361  DiversityPoolProcessor processor{ window_average_policy };
362  processor.add_calculators( settings, pool_sizes );
363  return processor;
364 }
365 
366 } // namespace population
367 } // namespace genesis
368 
369 #endif // include guard
thread_functions.hpp
genesis::population::DiversityPoolProcessor::thread_pool
DiversityPoolProcessor & thread_pool(std::shared_ptr< utils::ThreadPool > value)
Set the thread pool used for processing, if enough sample pairs are being processed.
Definition: diversity_pool_processor.hpp:132
genesis::population::FilterStatus::get
IntType get() const
Definition: filter_status.hpp:91
genesis::population::DiversityPoolProcessor::operator=
DiversityPoolProcessor & operator=(DiversityPoolProcessor const &)=default
genesis::population::DiversityPoolProcessor
Helper class to iterate over Variants and process the samples (SampleCounts), using a set of Diversit...
Definition: diversity_pool_processor.hpp:67
genesis::population::DiversityPoolProcessor::reset
void reset()
Definition: diversity_pool_processor.hpp:189
base_window.hpp
genesis::population::DiversityPoolProcessor::get_result
std::vector< DiversityPoolCalculator::Result > const & get_result(BaseWindow< D > const &window, std::shared_ptr< GenomeLocusSet > provided_loci) const
Get a list of all resulting values for all samples.
Definition: diversity_pool_processor.hpp:252
genesis::utils::Options::global_thread_pool
std::shared_ptr< ThreadPool > global_thread_pool() const
Return a global thread pool to be used for parallel computations.
Definition: options.hpp:268
genesis::population::DiversityPoolProcessor::get_filter_stats
VariantFilterStats get_filter_stats() const
Get the sum of filter statistics of all Variants processed here.
Definition: diversity_pool_processor.hpp:288
genesis::utils::parallel_for
MultiFuture< void > parallel_for(T1 begin, T2 end, F &&body, std::shared_ptr< ThreadPool > thread_pool=nullptr, size_t num_blocks=0, bool auto_wait=true)
Parallel for over a range of positions, breaking the range into blocks for which the body function is...
Definition: thread_functions.hpp:259
std.hpp
Provides some valuable additions to STD.
genesis::population::DiversityPoolProcessor::end
std::vector< DiversityPoolCalculator >::iterator end()
Definition: diversity_pool_processor.hpp:307
genesis::population::DiversityPoolCalculator::Result
Data struct to collect all diversity statistics computed here.
Definition: diversity_pool_calculator.hpp:139
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::population::DiversityPoolProcessor::size
size_t size() const
Definition: diversity_pool_processor.hpp:184
genesis::population::DiversityPoolProcessor::DiversityPoolProcessor
DiversityPoolProcessor()=default
Default constructor.
window_average.hpp
genesis::population::DiversityPoolProcessor::process
void process(Variant const &variant)
Definition: diversity_pool_processor.hpp:199
genesis::population::DiversityPoolProcessor::add_calculators
void add_calculators(DiversityPoolSettings const &settings, std::vector< size_t > const &pool_sizes)
Create and add a set of calculators for a given list of samples.
Definition: diversity_pool_processor.hpp:165
genesis::population::DiversityPoolProcessor::~DiversityPoolProcessor
~DiversityPoolProcessor()=default
genesis::population::DiversityPoolProcessor::begin
std::vector< DiversityPoolCalculator >::iterator begin()
Definition: diversity_pool_processor.hpp:302
genesis::population::make_diversity_pool_processor
DiversityPoolProcessor make_diversity_pool_processor(WindowAveragePolicy window_average_policy, DiversityPoolSettings const &settings, std::vector< size_t > const &pool_sizes)
Create an DiversityPoolProcessor to compute diversity for all samples.
Definition: diversity_pool_processor.hpp:356
genesis::population::DiversityPoolProcessor::threading_threshold
DiversityPoolProcessor & threading_threshold(size_t value)
Set the threshold of calculators after which the processing is done in threads.
Definition: diversity_pool_processor.hpp:152
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::FilterStatus::passing
bool passing() const
Definition: filter_status.hpp:75
genesis::population::window_average_denominator
double window_average_denominator(WindowAveragePolicy policy, BaseWindow< D > const &window, std::shared_ptr< GenomeLocusSet > provided_loci, VariantFilterStats const &variant_filter_stats, SampleCountsFilterStats const &sample_counts_filter_stats)
Get the denoninator to use for averaging an estimator across a window.
Definition: window_average.hpp:257
genome_locus_set.hpp
genesis::population::DiversityPoolProcessor::threading_threshold
size_t threading_threshold() const
Definition: diversity_pool_processor.hpp:138
variant_filter.hpp
genesis::population::WindowAveragePolicy
WindowAveragePolicy
Select the method to use for computing window averages of statistic estimators.
Definition: window_average.hpp:65
diversity_pool_calculator.hpp
options.hpp
genesis::population::Variant::samples
std::vector< SampleCounts > samples
Definition: variant.hpp:82
genesis::population::DiversityPoolProcessor::DiversityPoolProcessor
DiversityPoolProcessor(WindowAveragePolicy window_average_policy, std::shared_ptr< utils::ThreadPool > thread_pool=nullptr, size_t threading_threshold=4096)
Construct a processor.
Definition: diversity_pool_processor.hpp:93
genesis::population::FilterStats< VariantFilterTag >
variant.hpp
genesis::utils::Options::get
static Options & get()
Returns a single instance of this class.
Definition: options.hpp:68
genesis::population::DiversityPoolProcessor::calculators
std::vector< DiversityPoolCalculator > const & calculators() const
Definition: diversity_pool_processor.hpp:293
genesis::population::FilterStats::clear
void clear()
Definition: filter_stats.hpp:175
genesis::population::BaseWindow
Base class for Window and WindowView, to share common functionality.
Definition: base_window.hpp:60
genesis::population::DiversityPoolProcessor::get_result
std::vector< DiversityPoolCalculator::Result > const & get_result() const
Get a list of all resulting values for all samples.
Definition: diversity_pool_processor.hpp:272
thread_pool.hpp
genesis::population::DiversityPoolProcessor::thread_pool
std::shared_ptr< utils::ThreadPool > thread_pool() const
Get the thread pool used for processing, if enough sample pairs are being processed.
Definition: diversity_pool_processor.hpp:121
genesis::population::DiversityPoolCalculator
Compute Theta Pi, Theta Watterson, and Tajia's D in their pool-sequencing corrected versions accordin...
Definition: diversity_pool_calculator.hpp:122
genesis::population::DiversityPoolSettings
Settings used by different pool-sequencing corrected diversity statistics.
Definition: diversity_pool_functions.hpp:141
genesis::population::Variant::status
FilterStatus status
Definition: variant.hpp:76