A library for working with phylogenetic and population genetic data.
v0.32.0
diversity_pool_functions.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTION_DIVERSITY_POOL_FUNCTIONS_H_
2 #define GENESIS_POPULATION_FUNCTION_DIVERSITY_POOL_FUNCTIONS_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
38 
39 #include <cassert>
40 #include <cmath>
41 #include <cstdint>
42 #include <iterator>
43 #include <limits>
44 #include <string>
45 #include <type_traits>
46 #include <vector>
47 
48 namespace genesis {
49 namespace population {
50 
51 // =================================================================================================
52 // Diversity Pool Settings
53 // =================================================================================================
54 
62 {
72 
84 
101 
110  kPoolsize,
111 
128 };
129 
142 {
143  size_t min_count = 0;
144  size_t min_read_depth = 0;
145  size_t max_read_depth = 0;
146 
148 };
149 
150 // =================================================================================================
151 // Theta Pi
152 // =================================================================================================
153 
169 double heterozygosity( SampleCounts const& sample, bool with_bessel = false );
170 
177 template<class ForwardIterator>
178 double theta_pi(
179  ForwardIterator begin,
180  ForwardIterator end,
181  bool with_bessel = true,
182  bool only_passing_samples = true
183 ) {
184  double pi_sum = 0.0;
185  for( auto& it = begin; it != end; ++it ) {
186  if( only_passing_samples && !it->status.passing() ) {
187  continue;
188  }
189  pi_sum += heterozygosity( *it, with_bessel );
190  }
191  return pi_sum;
192 }
193 
202 template<class ForwardIterator>
204  size_t poolsize,
205  ForwardIterator begin,
206  ForwardIterator end,
207  bool only_passing_samples = true
208 ) {
209  auto const psb = static_cast<double>( poolsize ) / ( static_cast<double>( poolsize ) - 1.0 );
210  double pi_sum = 0.0;
211  for( auto& it = begin; it != end; ++it ) {
212  if( only_passing_samples && !it->status.passing() ) {
213  continue;
214  }
215 
216  // Apply heterozygosity with Bessel's for nucleotide count, and Bessel's for pool size.
217  pi_sum += heterozygosity( *it, true ) * psb;
218  }
219  return pi_sum;
220 }
221 
233  DiversityPoolSettings const& settings,
234  size_t poolsize,
235  size_t nucleotide_count
236 );
237 
249 template<class ForwardIterator>
250 double theta_pi_pool( // get_pi_calculator
251  DiversityPoolSettings const& settings,
252  size_t poolsize,
253  ForwardIterator begin,
254  ForwardIterator end,
255  bool only_passing_samples = true
256 ) {
257  // PoPoolation variable names:
258  // poolsize: n
259  // min_count: b
260 
261  double pi_sum = 0.0;
262  for( auto& it = begin; it != end; ++it ) {
263  if( only_passing_samples && !it->status.passing() ) {
264  continue;
265  }
266 
267  double const pi_snp = heterozygosity( *it, true );
268  double const denom = theta_pi_pool_denominator(
269  settings, poolsize, nucleotide_sum( *it )
270  );
271  if( std::isfinite( pi_snp ) && std::isfinite( denom )) {
272  pi_sum += ( pi_snp / denom );
273  }
274  }
275  return pi_sum;
276 }
277 
288 inline double theta_pi_pool(
289  DiversityPoolSettings const& settings,
290  size_t poolsize,
291  SampleCounts const& sample
292 ) {
293  auto const h = heterozygosity( sample, true );
294  auto const d = theta_pi_pool_denominator( settings, poolsize, nucleotide_sum( sample ));
295  return h / d;
296 }
297 
298 // =================================================================================================
299 // Theta Watterson
300 // =================================================================================================
301 
311  DiversityPoolSettings const& settings,
312  size_t poolsize,
313  size_t nucleotide_count
314 );
315 
322 template<class ForwardIterator>
323 double theta_watterson_pool( // get_theta_calculator
324  DiversityPoolSettings const& settings,
325  size_t poolsize,
326  ForwardIterator begin,
327  ForwardIterator end,
328  bool only_passing_samples = true
329 ) {
330  // PoPoolation variable names:
331  // poolsize: n
332  // min_count: b
333 
334  double sum = 0.0;
335  for( auto& it = begin; it != end; ++it ) {
336  if( only_passing_samples && !it->status.passing() ) {
337  continue;
338  }
339 
340  auto const denom = theta_watterson_pool_denominator(
341  settings, poolsize, nucleotide_sum( *it )
342  );
343  if( std::isfinite( denom )) {
344  sum += 1.0 / denom;
345  }
346  }
347  return sum;
348 }
349 
356 inline double theta_watterson_pool(
357  DiversityPoolSettings const& settings,
358  size_t poolsize,
359  SampleCounts const& sample
360 ) {
361  return 1.0 / theta_watterson_pool_denominator( settings, poolsize, nucleotide_sum( sample ));
362 }
363 
364 // =================================================================================================
365 // Tajima's D Helper Functions
366 // =================================================================================================
367 
391 double a_n( double n );
392 
416 double b_n( double n );
417 
432 double f_star( double a_n, double n );
433 
459 double alpha_star( double n );
460 
468 double beta_star( double n );
469 
495 double n_base_matrix( size_t read_depth, size_t poolsize );
496 
510 double n_base( size_t read_depth, size_t poolsize );
511 
512 // =================================================================================================
513 // Tajima's D
514 // =================================================================================================
515 
539  DiversityPoolSettings const& settings,
540  double theta,
541  size_t poolsize,
542  double window_avg_denom,
543  size_t empirical_min_read_depth
544 );
545 
553 inline double tajima_d_pool(
554  DiversityPoolSettings const& settings,
555  double theta_pi,
556  double theta_watterson,
557  size_t poolsize,
558  double window_avg_denom,
559  size_t empirical_min_read_depth
560 ) {
561  // Edge case, following what PoPoolation does in this situation.
562  // Deactivated - we want a nan instead.
563  // if( window_avg_denom == 0 ) {
564  // return 0.0;
565  // }
566 
567  // We already have the two theta statistics given here, but need to compute the
568  // denominator according to Kofler et al for pooled sequences.
569  auto const denom = tajima_d_pool_denominator(
570  settings, theta_watterson, poolsize, window_avg_denom, empirical_min_read_depth
571  );
572  return ( theta_pi - theta_watterson ) / denom;
573 }
574 
584 template<class ForwardIterator>
585 double tajima_d_pool( // get_D_calculator
586  DiversityPoolSettings const& settings,
587  double theta_pi,
588  double theta_watterson,
589  size_t poolsize,
590  ForwardIterator begin,
591  ForwardIterator end,
592  bool only_passing_samples = true
593 ) {
594  // If we need the empirical min read depth, compute it.
595  // If not, we can skip this step.
596  size_t empirical_min_read_depth = std::numeric_limits<size_t>::max();
597  auto entry_count = std::distance( begin, end );
598  if(
599  only_passing_samples ||
601  ) {
602  entry_count = 0;
603  for( auto& it = begin; it != end; ++it ) {
604  if( only_passing_samples && !it->status.passing() ) {
605  continue;
606  }
607 
608  auto const cov = nucleotide_sum( *it );
609  if( cov < empirical_min_read_depth ) {
610  empirical_min_read_depth = cov;
611  }
612  ++entry_count;
613  }
614  }
615 
616  return tajima_d_pool(
617  settings, theta_pi, theta_watterson, poolsize, entry_count, empirical_min_read_depth
618  );
619 }
620 
631 template<class ForwardIterator>
632 double tajima_d_pool( // get_D_calculator
633  DiversityPoolSettings const& settings,
634  size_t poolsize,
635  ForwardIterator begin,
636  ForwardIterator end,
637  bool only_passing_samples = true
638 ) {
639  // First compute the two theta statistics, then call the other version of this function.
640  auto const pi = theta_pi_pool( settings, poolsize, begin, end, only_passing_samples );
641  auto const tw = theta_watterson_pool( settings, poolsize, begin, end, only_passing_samples );
642  return tajima_d_pool( settings, pi, tw, poolsize, begin, end, only_passing_samples );
643 }
644 
645 } // namespace population
646 } // namespace genesis
647 
648 #endif // include guard
genesis::population::DiversityPoolSettings::tajima_denominator_policy
TajimaDenominatorPolicy tajima_denominator_policy
Definition: diversity_pool_functions.hpp:147
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
functions.hpp
genesis::population::SampleCounts
One set of nucleotide sample counts, for example for a given sample that represents a pool of sequenc...
Definition: sample_counts.hpp:56
genesis::population::theta_watterson_pool_denominator
double theta_watterson_pool_denominator(DiversityPoolSettings const &settings, size_t poolsize, size_t nucleotide_count)
Compute the denominator for the pool-sequencing correction of theta watterson according to Kofler et ...
Definition: diversity_pool_functions.cpp:231
genesis::population::f_star
double f_star(double a_n, double n)
Compute f* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity_pool_functions.cpp:324
genesis::population::theta_pi_pool_denominator
double theta_pi_pool_denominator(DiversityPoolSettings const &settings, size_t poolsize, size_t nucleotide_count)
Compute the denominator for the pool-sequencing correction of theta pi according to Kofler et al.
Definition: diversity_pool_functions.cpp:172
genesis::population::nucleotide_sum
constexpr size_t nucleotide_sum(SampleCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: population/function/functions.hpp:296
genesis::population::TajimaDenominatorPolicy::kPoolsize
@ kPoolsize
Instead of using n_base() to obtain the number of individuals sequenced (empirical pool size),...
genesis::population::DiversityPoolSettings::max_read_depth
size_t max_read_depth
Definition: diversity_pool_functions.hpp:145
genesis::population::TajimaDenominatorPolicy::kUncorrected
@ kUncorrected
Do not correct Tajima's D at all.
sample_counts_filter.hpp
genesis::population::DiversityPoolSettings::min_read_depth
size_t min_read_depth
Definition: diversity_pool_functions.hpp:144
genesis::population::n_base
double n_base(size_t read_depth, size_t poolsize)
Compute the n_base term used for Tajima's D in Kofler et al. 2011, using a faster closed form express...
Definition: diversity_pool_functions.cpp:501
genesis::population::TajimaDenominatorPolicy::kEmpiricalMinReadDepth
@ kEmpiricalMinReadDepth
Use the empirical minimum read depth found in each window to compute the expected number of individua...
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::b_n
double b_n(double n)
Compute b_n, the sum of squared reciprocals.
Definition: diversity_pool_functions.cpp:307
genesis::population::n_base_matrix
double n_base_matrix(size_t read_depth, size_t poolsize)
Compute the n_base term used for Tajima's D in Kofler et al. 2011, following their approach.
Definition: diversity_pool_functions.cpp:467
variant_filter.hpp
genesis::population::theta_pi_pool
double theta_pi_pool(DiversityPoolSettings const &settings, size_t poolsize, ForwardIterator begin, ForwardIterator end, bool only_passing_samples=true)
Compute theta pi with pool-sequencing correction according to Kofler et al, that is,...
Definition: diversity_pool_functions.hpp:250
genesis::population::TajimaDenominatorPolicy
TajimaDenominatorPolicy
Select how to compute the denominator for the pool sequencing correction of Tajima's D.
Definition: diversity_pool_functions.hpp:61
genesis::population::beta_star
double beta_star(double n)
Compute beta* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity_pool_functions.cpp:358
genesis::population::theta_watterson_pool
double theta_watterson_pool(DiversityPoolSettings const &settings, size_t poolsize, ForwardIterator begin, ForwardIterator end, bool only_passing_samples=true)
Compute theta watterson with pool-sequencing correction according to Kofler et al.
Definition: diversity_pool_functions.hpp:323
genesis::population::DiversityPoolSettings::min_count
size_t min_count
Definition: diversity_pool_functions.hpp:143
variant.hpp
genesis::population::a_n
double a_n(double n)
Compute a_n, the sum of reciprocals.
Definition: diversity_pool_functions.cpp:285
genesis::population::theta_pi
double theta_pi(ForwardIterator begin, ForwardIterator end, bool with_bessel=true, bool only_passing_samples=true)
Compute classic theta pi, that is, the sum of heterozygosities.
Definition: diversity_pool_functions.hpp:178
genesis::population::TajimaDenominatorPolicy::kWithPopoolationBugs
@ kWithPopoolationBugs
Replicate the original behaviour of PoPoolation <= 1.2.2.
genesis::population::alpha_star
double alpha_star(double n)
Compute alpha* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity_pool_functions.cpp:330
genesis::population::TajimaDenominatorPolicy::kProvidedMinReadDepth
@ kProvidedMinReadDepth
Fix the bugs of the original PoPoolation, but still use their way of computing the empirical pool siz...
genesis::population::tajima_d_pool_denominator
double tajima_d_pool_denominator(DiversityPoolSettings const &settings, double theta, size_t poolsize, double window_avg_denom, size_t empirical_min_read_depth)
Compute the denominator for the pool-sequencing correction of Tajima's D according to Kofler et al.
Definition: diversity_pool_functions.cpp:520
genesis::population::DiversityPoolSettings
Settings used by different pool-sequencing corrected diversity statistics.
Definition: diversity_pool_functions.hpp:141
genesis::population::theta_pi_within_pool
double theta_pi_within_pool(size_t poolsize, ForwardIterator begin, ForwardIterator end, bool only_passing_samples=true)
Compute classic theta pi (within a population), that is, the sum of heterozygosities including Bessel...
Definition: diversity_pool_functions.hpp:203
genesis::population::tajima_d_pool
double tajima_d_pool(DiversityPoolSettings const &settings, double theta_pi, double theta_watterson, size_t poolsize, double window_avg_denom, size_t empirical_min_read_depth)
Compute the pool-sequencing corrected version of Tajima's D according to Kofler et al.
Definition: diversity_pool_functions.hpp:553
genesis::population::heterozygosity
double heterozygosity(SampleCounts const &sample, bool with_bessel)
Compute classic heterozygosity.
Definition: diversity_pool_functions.cpp:150