A library for working with phylogenetic and population genetic data.
v0.32.0
diversity_pool_calculator.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTION_DIVERSITY_POOL_CALCULATOR_H_
2 #define GENESIS_POPULATION_FUNCTION_DIVERSITY_POOL_CALCULATOR_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
39 
40 #include <cassert>
41 #include <cmath>
42 #include <cstdint>
43 #include <iterator>
44 #include <limits>
45 #include <stdexcept>
46 #include <string>
47 #include <type_traits>
48 #include <utility>
49 #include <vector>
50 
51 namespace genesis {
52 namespace population {
53 
54 // =================================================================================================
55 // Diversity Pool Calculator
56 // =================================================================================================
57 
123 {
124 public:
125 
126  // -------------------------------------------------------------------------
127  // Typedefs and Enums
128  // -------------------------------------------------------------------------
129 
139  struct Result
140  {
141  // Results of the diversity statistics
142  double theta_pi = std::numeric_limits<double>::quiet_NaN();
143  double theta_watterson = std::numeric_limits<double>::quiet_NaN();
144  double tajima_d = std::numeric_limits<double>::quiet_NaN();
145  };
146 
147  // -------------------------------------------------------------------------
148  // Constructors and Rule of Five
149  // -------------------------------------------------------------------------
150 
151  DiversityPoolCalculator( DiversityPoolSettings const& settings, size_t pool_size )
152  : settings_( settings )
153  , pool_size_( pool_size )
154  {
155  if( settings.min_count == 0 ) {
156  throw std::invalid_argument(
157  "In DiversityPoolCalculator, settings.min_count == 0 is not allowed."
158  );
159  }
160  if( pool_size == 0 ) {
161  throw std::invalid_argument(
162  "In DiversityPoolCalculator, pool_size == 0 is not allowed."
163  );
164  }
165  }
166  ~DiversityPoolCalculator() = default;
167 
170 
173 
175 
176  // -------------------------------------------------------------------------
177  // Settings
178  // -------------------------------------------------------------------------
179 
181  {
182  only_passing_samples_ = value;
183  return *this;
184  }
185 
186  bool only_passing_samples() const
187  {
188  return only_passing_samples_;
189  }
190 
191  self_type& enable_theta_pi( bool value )
192  {
193  enable_theta_pi_ = value;
194  return *this;
195  }
196 
197  bool enable_theta_pi() const
198  {
199  return enable_theta_pi_;
200  }
201 
203  {
204  enable_theta_watterson_ = value;
205  return *this;
206  }
207 
209  {
210  return enable_theta_watterson_;
211  }
212 
213  self_type& enable_tajima_d( bool value )
214  {
215  enable_tajima_d_ = value;
216  return *this;
217  }
218 
219  bool enable_tajima_d() const
220  {
221  return enable_tajima_d_;
222  }
223 
224  // -------------------------------------------------------------------------
225  // Calculator Functions
226  // -------------------------------------------------------------------------
227 
228  void reset()
229  {
230  theta_pi_sum_.reset();
231  theta_watterson_sum_.reset();
232  filter_stats_.clear();
233  empirical_min_read_depth_ = std::numeric_limits<size_t>::max();
234  }
235 
244  void process( SampleCounts const& sample )
245  {
246  // We first check if we need to process this sample at all.
247  // That is either the case if we process _all_ samples (!only_passing_samples_)
248  // or if we only consider the passing ones and the sample is passing.
249  // We assume that the Variant::status has already been checked before calling this,
250  // for instance by the DiversityPoolProcessor.
251  ++filter_stats_[sample.status.get()];
252  if( ! sample.status.passing() ) {
253  return;
254  }
255 
256  // Now run the calculations that we need.
257  double tp = std::numeric_limits<double>::quiet_NaN();
258  double tw = std::numeric_limits<double>::quiet_NaN();
259  if( enable_theta_pi_ || enable_tajima_d_ ) {
260  tp = theta_pi_pool( settings_, pool_size_, sample );
261  if( std::isfinite( tp )) {
262  theta_pi_sum_ += tp;
263  }
264  // assert( std::isfinite( tp ) );
265  }
266  if( enable_theta_watterson_ || enable_tajima_d_ ) {
267  tw = theta_watterson_pool( settings_, pool_size_, sample );
268  if( std::isfinite( tw )) {
269  theta_watterson_sum_ += tw;
270  }
271  // assert( std::isfinite( tw ) );
272  }
273 
274  // We want to keep track of the minimum read depth of the data that we are processing.
275  // This is only needed when using TajimaDenominatorPolicy::kEmpiricalMinReadDepth,
276  // but cheap enough to just always keep track of here.
277  auto const cov = nucleotide_sum( sample );
278  if( cov > 0 && cov < empirical_min_read_depth_ ) {
279  empirical_min_read_depth_ = cov;
280  }
281  }
282 
291  Result get_result( double window_avg_denom ) const
292  {
293  Result result;
294  if( enable_theta_pi_ ) {
295  result.theta_pi = theta_pi_sum_.get() / window_avg_denom;
296  }
297  if( enable_theta_watterson_ ) {
298  result.theta_watterson = theta_watterson_sum_.get() / window_avg_denom;
299  }
300  if( enable_tajima_d_ ) {
301  // Yet another problem in PoPoolation: For the |W| window size in the denominator
302  // of Tajima's D, they use the number of SNPs in that window, which might or might not
303  // be correct - we will have to figure this out. There is a chance that this is correct,
304  // but it could also be that we want to use the the number of _all_ valid positions
305  // (the ones that passed all filters, including any invariant positions) here again.
306  // For now, we follow their approach, but might leave this to fix later.
307  double const tajimas_window_avg_denom = filter_stats_[SampleCountsFilterTag::kPassed];
308 
309  // Potential fix:
310  // auto tajimas_window_avg_denom = window_avg_denom;
311  // if( settings_.tajima_denominator_policy == TajimaDenominatorPolicy::kWithPopoolationBugs ) {
312  // tajimas_window_avg_denom = filter_stats_[SampleCountsFilterTag::kPassed];
313  // }
314 
315  result.tajima_d = tajima_d_pool(
316  settings_,
317  theta_pi_sum_.get(), theta_watterson_sum_.get(),
318  pool_size_, tajimas_window_avg_denom, empirical_min_read_depth_
319  );
320  }
321  return result;
322  }
323 
331  {
332  return filter_stats_;
333  }
334 
335  // -------------------------------------------------------------------------
336  // Private Members
337  // -------------------------------------------------------------------------
338 
339 private:
340 
341  // Settings
342  DiversityPoolSettings settings_;
343  size_t pool_size_ = 0;
344 
345  bool only_passing_samples_ = true;
346  bool enable_theta_pi_ = true;
347  bool enable_theta_watterson_ = true;
348  bool enable_tajima_d_ = true;
349 
350  // Data Accumulation
351  utils::NeumaierSum theta_pi_sum_;
352  utils::NeumaierSum theta_watterson_sum_;
353  SampleCountsFilterStats filter_stats_;
354 
355  // Find the minimum empirical read depth that we see in the processed data.
356  size_t empirical_min_read_depth_ = std::numeric_limits<size_t>::max();
357 };
358 
359 } // namespace population
360 } // namespace genesis
361 
362 #endif // include guard
genesis::population::DiversityPoolCalculator::Result::theta_watterson
double theta_watterson
Definition: diversity_pool_calculator.hpp:143
genesis::population::DiversityPoolCalculator::reset
void reset()
Definition: diversity_pool_calculator.hpp:228
functions.hpp
genesis::population::DiversityPoolCalculator::only_passing_samples
bool only_passing_samples() const
Definition: diversity_pool_calculator.hpp:186
genesis::population::FilterStatus::get
IntType get() const
Definition: filter_status.hpp:91
genesis::population::DiversityPoolCalculator::process
void process(SampleCounts const &sample)
Process a sample, by computing its Theta Pi and Theta Watterson values, respectively.
Definition: diversity_pool_calculator.hpp:244
genesis::utils::CompensatedSum
Compensated summation algorithmm, such as Kahan, Neumaier, and Klein summation.
Definition: compensated_sum.hpp:49
genesis::population::SampleCounts
One set of nucleotide sample counts, for example for a given sample that represents a pool of sequenc...
Definition: sample_counts.hpp:56
diversity_pool_functions.hpp
genesis::population::DiversityPoolCalculator::get_result
Result get_result(double window_avg_denom) const
Convenience function to obtain all results at once.
Definition: diversity_pool_calculator.hpp:291
genesis::population::nucleotide_sum
constexpr size_t nucleotide_sum(SampleCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: population/function/functions.hpp:296
genesis::population::SampleCounts::status
FilterStatus status
Status to indicate whether any applied filters failed to pass.
Definition: sample_counts.hpp:96
genesis::population::DiversityPoolCalculator::only_passing_samples
self_type & only_passing_samples(bool value)
Definition: diversity_pool_calculator.hpp:180
genesis::population::DiversityPoolCalculator::enable_theta_pi
self_type & enable_theta_pi(bool value)
Definition: diversity_pool_calculator.hpp:191
genesis::population::DiversityPoolCalculator::Result
Data struct to collect all diversity statistics computed here.
Definition: diversity_pool_calculator.hpp:139
genesis::population::DiversityPoolCalculator::Result::theta_pi
double theta_pi
Definition: diversity_pool_calculator.hpp:142
sample_counts_filter.hpp
genesis::population::DiversityPoolCalculator::enable_theta_watterson
bool enable_theta_watterson() const
Definition: diversity_pool_calculator.hpp:208
genesis::utils::CompensatedSum::reset
void reset()
Definition: compensated_sum.hpp:213
genesis::population::SampleCountsFilterTag::kPassed
@ kPassed
Sample has passed all filters.
genesis::population::DiversityPoolCalculator::get_filter_stats
SampleCountsFilterStats get_filter_stats() const
Get the sum of filter statistics of all sample pairs processed here.
Definition: diversity_pool_calculator.hpp:330
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::FilterStatus::passing
bool passing() const
Definition: filter_status.hpp:75
genesis::utils::CompensatedSum::get
double get() const
Definition: compensated_sum.hpp:221
genesis::population::DiversityPoolCalculator::operator=
DiversityPoolCalculator & operator=(DiversityPoolCalculator const &)=default
genesis::population::DiversityPoolCalculator::enable_tajima_d
bool enable_tajima_d() const
Definition: diversity_pool_calculator.hpp:219
genesis::population::DiversityPoolCalculator::enable_theta_pi
bool enable_theta_pi() const
Definition: diversity_pool_calculator.hpp:197
genesis::population::theta_pi_pool
double theta_pi_pool(DiversityPoolSettings const &settings, size_t poolsize, ForwardIterator begin, ForwardIterator end, bool only_passing_samples=true)
Compute theta pi with pool-sequencing correction according to Kofler et al, that is,...
Definition: diversity_pool_functions.hpp:250
genesis::population::DiversityPoolCalculator::Result::tajima_d
double tajima_d
Definition: diversity_pool_calculator.hpp:144
genesis::population::DiversityPoolCalculator::~DiversityPoolCalculator
~DiversityPoolCalculator()=default
genesis::population::DiversityPoolCalculator::enable_tajima_d
self_type & enable_tajima_d(bool value)
Definition: diversity_pool_calculator.hpp:213
genesis::population::DiversityPoolCalculator::enable_theta_watterson
self_type & enable_theta_watterson(bool value)
Definition: diversity_pool_calculator.hpp:202
genesis::population::FilterStats< SampleCountsFilterTag >
genesis::population::theta_watterson_pool
double theta_watterson_pool(DiversityPoolSettings const &settings, size_t poolsize, ForwardIterator begin, ForwardIterator end, bool only_passing_samples=true)
Compute theta watterson with pool-sequencing correction according to Kofler et al.
Definition: diversity_pool_functions.hpp:323
genesis::population::DiversityPoolSettings::min_count
size_t min_count
Definition: diversity_pool_functions.hpp:143
variant.hpp
genesis::population::FilterStats::clear
void clear()
Definition: filter_stats.hpp:175
compensated_sum.hpp
genesis::population::DiversityPoolCalculator::DiversityPoolCalculator
DiversityPoolCalculator(DiversityPoolSettings const &settings, size_t pool_size)
Definition: diversity_pool_calculator.hpp:151
genesis::population::DiversityPoolCalculator
Compute Theta Pi, Theta Watterson, and Tajia's D in their pool-sequencing corrected versions accordin...
Definition: diversity_pool_calculator.hpp:122
genesis::population::DiversityPoolSettings
Settings used by different pool-sequencing corrected diversity statistics.
Definition: diversity_pool_functions.hpp:141
genesis::population::tajima_d_pool
double tajima_d_pool(DiversityPoolSettings const &settings, double theta_pi, double theta_watterson, size_t poolsize, double window_avg_denom, size_t empirical_min_read_depth)
Compute the pool-sequencing corrected version of Tajima's D according to Kofler et al.
Definition: diversity_pool_functions.hpp:553