A library for working with phylogenetic and population genetic data.
v0.27.0
diversity.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTIONS_DIVERSITY_H_
2 #define GENESIS_POPULATION_FUNCTIONS_DIVERSITY_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2022 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
40 
41 #include <cassert>
42 #include <cmath>
43 #include <cstdint>
44 #include <iterator>
45 #include <string>
46 #include <type_traits>
47 #include <vector>
48 
49 namespace genesis {
50 namespace population {
51 
52 // =================================================================================================
53 // Diversity Estimates
54 // =================================================================================================
55 
75 {
76  size_t poolsize = 0;
77 
78  size_t min_allele_count = 0;
79  size_t min_coverage = 0;
80  size_t max_coverage = 0;
81 
82  bool with_popoolation_bugs = false;
83 };
84 
93 {
94  double theta_pi_absolute = 0;
95  double theta_pi_relative = 0;
98  double tajima_d = 0;
99 
104  size_t variant_count = 0;
105 
114  size_t coverage_count = 0;
115 
123  size_t snp_count = 0;
124 };
125 
141 double heterozygosity( BaseCounts const& sample, bool with_bessel = false );
142 
143 // =================================================================================================
144 // Theta Pi
145 // =================================================================================================
146 
167  PoolDiversitySettings const& settings,
168  size_t nucleotide_count
169 );
170 
177 template<class ForwardIterator>
178 double theta_pi(
179  ForwardIterator begin, ForwardIterator end,
180  bool with_bessel = true
181 ) {
182  double pi_sum = 0.0;
183  for( auto& it = begin; it != end; ++it ) {
184  pi_sum += heterozygosity( *it, with_bessel );
185  }
186  return pi_sum;
187 }
188 
198 template<class ForwardIterator>
199 double theta_pi_pool( // get_pi_calculator
200  PoolDiversitySettings const& settings,
201  ForwardIterator begin,
202  ForwardIterator end
203 ) {
204  // PoPoolation variable names:
205  // poolsize: n
206  // min_allele_count: b
207 
208  double pi_sum = 0.0;
209  for( auto& it = begin; it != end; ++it ) {
210  double const pi_snp = heterozygosity( *it, true );
211  double const denom = theta_pi_pool_denominator( settings, nucleotide_sum( *it ));
212  pi_sum += ( pi_snp / denom );
213  }
214  return pi_sum;
215 }
216 
222 inline double theta_pi_pool(
223  PoolDiversitySettings const& settings,
224  BaseCounts const& sample
225 ) {
226  auto const h = heterozygosity( sample, true );
227  auto const d = theta_pi_pool_denominator( settings, nucleotide_sum( sample ));
228  return h / d;
229 }
230 
239 template<class ForwardIterator>
241  ForwardIterator begin, ForwardIterator end,
242  size_t poolsize
243 ) {
244  auto const psb = static_cast<double>( poolsize ) / ( static_cast<double>( poolsize ) - 1.0 );
245  double pi_sum = 0.0;
246  for( auto& it = begin; it != end; ++it ) {
247  // Apply heterozygosity with Bessel's for nucleotide count, and Bessel's for pool size.
248  pi_sum += heterozygosity( *it, true ) * psb;
249  }
250  return pi_sum;
251 }
252 
253 // =================================================================================================
254 // Theta Watterson
255 // =================================================================================================
256 
264  PoolDiversitySettings const& settings,
265  size_t nucleotide_count
266 );
267 
271 template<class ForwardIterator>
272 double theta_watterson_pool( // get_theta_calculator
273  PoolDiversitySettings const& settings,
274  ForwardIterator begin,
275  ForwardIterator end
276 ) {
277  // PoPoolation variable names:
278  // poolsize: n
279  // min_allele_count: b
280 
281  double sum = 0.0;
282  for( auto& it = begin; it != end; ++it ) {
283  sum += 1.0 / theta_watterson_pool_denominator( settings, nucleotide_sum( *it ));
284  }
285  return sum;
286 }
287 
288 // =================================================================================================
289 // Tajima's D
290 // =================================================================================================
291 
307 double a_n( size_t n );
308 
329 double b_n( size_t n );
330 
343 double f_star( double a_n, double n );
344 
368 double alpha_star( double n );
369 
375 double beta_star( double n );
376 
400 double n_base_matrix( size_t coverage, size_t poolsize );
401 
413 double n_base( size_t coverage, size_t poolsize );
414 
420  PoolDiversitySettings const& settings,
421  size_t snp_count,
422  double theta
423 );
424 
429 template<class ForwardIterator>
430 double tajima_d_pool( // get_D_calculator
431  PoolDiversitySettings const& settings,
432  ForwardIterator begin,
433  ForwardIterator end,
434  double theta_pi,
435  double theta_watterson
436 ) {
437  // Edge case, following what PoPoolation does in this situation.
438  if( begin == end ) {
439  return 0.0;
440  }
441 
442  // We already have the two theta statistics given here, but need to compute the
443  // denominator according to Kofler et al for pooled sequences.
444  auto const snp_cnt = std::distance( begin, end );
445  auto const denom = tajima_d_pool_denominator( settings, snp_cnt, theta_watterson );
446  return ( theta_pi - theta_watterson ) / denom;
447 }
448 
453 template<class ForwardIterator>
454 double tajima_d_pool( // get_D_calculator
455  PoolDiversitySettings const& settings,
456  ForwardIterator begin,
457  ForwardIterator end
458 ) {
459  // First compute the two theta statistics, then call the other version of this function.
460  auto const pi = theta_pi_pool( settings, begin, end );
461  auto const tw = theta_watterson_pool( settings, begin, end );
462  return tajima_d_pool( settings, begin, end, pi, tw );
463 }
464 
465 // =================================================================================================
466 // Diversity Measures
467 // =================================================================================================
468 
483 template<class ForwardIterator>
485  PoolDiversitySettings const& settings,
486  ForwardIterator begin,
487  ForwardIterator end
488 ) {
489  PoolDiversityResults results;
490 
491  // Filter all counts that are below the given min count.
492  // The following is inefficient, as we do the transform multiple times in all the loops
493  // below (note that all statistics functions also loop at least once!). But this function
494  // here is meant as a high level function for simplicity, and we have to live with that.
495  // Better solution is to run transform_zero_out_by_min_count() already when adding the samples
496  // to the range that we loop over here.
497  auto min_filtered_range = utils::make_transform_range(
498  [&]( BaseCounts const& sample ){
499  auto copy = sample;
501  return copy;
502  },
503  begin, end
504  );
505 
506  // Count how many SNPs there are in total, and how many sites have the needed coverage.
507  for( auto it = min_filtered_range.begin(); it != min_filtered_range.end(); ++it ) {
508  auto const stat = status(
509  *it, settings.min_coverage, settings.max_coverage, settings.min_allele_count
510  );
511 
512  // Make sure that we are dealing with the correct types here.
513  // (Just in case we ever refactor the class where these values are stored.)
514  static_assert(
515  std::is_same<decltype(stat.is_snp), bool>::value,
516  "Expect bool type for BaseCountsStatus::is_snp"
517  );
518  static_assert(
519  std::is_same<decltype(stat.is_covered), bool>::value,
520  "Expect bool type for BaseCountsStatus::is_covered"
521  );
522  static_assert( static_cast<size_t>( true ) == 1, "Expect true == 1" );
523  static_assert( static_cast<size_t>( false ) == 0, "Expect false == 0" );
524 
525  // Add them up.
526  ++results.variant_count;
527  results.coverage_count += static_cast<size_t>( stat.is_covered );
528  results.snp_count += static_cast<size_t>( stat.is_snp );
529  }
530 
531  // Make a filter that only allows samples that are SNPs and have the needed coverage.
532  auto covered_snps_range = utils::make_filter_range( [&]( BaseCounts const& sample ){
533  auto stat = status(
534  sample, settings.min_coverage, settings.max_coverage, settings.min_allele_count
535  );
536  return stat.is_covered && stat.is_snp;
537  }, min_filtered_range.begin(), min_filtered_range.end() );
538 
539  // Compute all values.
541  settings, covered_snps_range.begin(), covered_snps_range.end()
542  );
544  settings, covered_snps_range.begin(), covered_snps_range.end()
545  );
546  results.tajima_d = tajima_d_pool(
547  settings, covered_snps_range.begin(), covered_snps_range.end(),
549  );
550 
551  // Compute the relative versions of the values.
552  auto const coverage = static_cast<double>( results.coverage_count );
553  results.theta_pi_relative = results.theta_pi_absolute / coverage;
554  results.theta_watterson_relative = results.theta_watterson_absolute / coverage;
555 
556  return results;
557 }
558 
559 } // namespace population
560 } // namespace genesis
561 
562 #endif // include guard
genesis::population::theta_pi_pool
double theta_pi_pool(PoolDiversitySettings const &settings, ForwardIterator begin, ForwardIterator end)
Compute theta pi with pool-sequencing correction according to Kofler et al, that is,...
Definition: diversity.hpp:199
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
genesis::population::a_n
double a_n(size_t n)
Compute a_n, the sum of reciprocals.
Definition: diversity.cpp:231
genesis::population::heterozygosity
double heterozygosity(BaseCounts const &sample, bool with_bessel)
Compute classic heterozygosity.
Definition: diversity.cpp:110
genesis::population::PoolDiversitySettings::min_coverage
size_t min_coverage
Definition: diversity.hpp:79
genesis::population::f_star
double f_star(double a_n, double n)
Compute f* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity.cpp:257
genesis::population::PoolDiversitySettings::min_allele_count
size_t min_allele_count
Definition: diversity.hpp:78
genesis::population::tajima_d_pool_denominator
double tajima_d_pool_denominator(PoolDiversitySettings const &settings, size_t snp_count, double theta)
Compute the denominator for the pool-sequencing correction of Tajima's D according to Kofler et al.
Definition: diversity.cpp:451
filter_iterator.hpp
genesis::population::PoolDiversityResults::tajima_d
double tajima_d
Definition: diversity.hpp:98
genesis::population::PoolDiversityResults::theta_watterson_relative
double theta_watterson_relative
Definition: diversity.hpp:97
genesis::population::PoolDiversitySettings::max_coverage
size_t max_coverage
Definition: diversity.hpp:80
genesis::population::PoolDiversitySettings::poolsize
size_t poolsize
Definition: diversity.hpp:76
genesis::population::theta_watterson_pool
double theta_watterson_pool(PoolDiversitySettings const &settings, ForwardIterator begin, ForwardIterator end)
Compute theta watterson with pool-sequencing correction according to Kofler et al.
Definition: diversity.hpp:272
genesis::population::n_base_matrix
double n_base_matrix(size_t coverage, size_t poolsize)
Compute the n_base term used for Tajima's D in Kofler et al. 2011, following their approach.
Definition: diversity.cpp:398
genesis::population::theta_watterson_pool_denominator
double theta_watterson_pool_denominator(PoolDiversitySettings const &settings, size_t nucleotide_count)
Compute the denominator for the pool-sequencing correction of theta watterson according to Kofler et ...
Definition: diversity.cpp:186
genesis::population::pool_diversity_measures
PoolDiversityResults pool_diversity_measures(PoolDiversitySettings const &settings, ForwardIterator begin, ForwardIterator end)
Compute Theta Pi, Theta Watterson, and Tajia's D in their pool-sequencing corrected versions accordin...
Definition: diversity.hpp:484
genesis::population::nucleotide_sum
size_t nucleotide_sum(BaseCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: population/functions/functions.hpp:222
matrix.hpp
genesis::population::b_n
double b_n(size_t n)
Compute b_n, the sum of squared reciprocals.
Definition: diversity.cpp:244
genesis::population::n_base
double n_base(size_t coverage, size_t poolsize)
Compute the n_base term used for Tajima's D in Kofler et al. 2011, using a faster closed form express...
Definition: diversity.cpp:432
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::PoolDiversityResults::theta_watterson_absolute
double theta_watterson_absolute
Definition: diversity.hpp:96
transform_iterator.hpp
genesis::population::status
BaseCountsStatus status(BaseCounts const &sample, size_t min_coverage, size_t max_coverage, size_t min_count, bool tolerate_deletions)
Compute a simple status with useful properties from the counts of a BaseCounts.
Definition: population/functions/functions.cpp:49
genesis::population::BaseCounts
One set of nucleotide base counts, for example for a given sample that represents a pool of sequenced...
Definition: base_counts.hpp:54
genesis::utils::make_filter_range
Range< FilterIterator< PredicateFunctor, BaseIterator > > make_filter_range(PredicateFunctor unary_func, BaseIterator begin, BaseIterator end)
Construct a filtering range, given the filter predicate function as well as the underlying base itera...
Definition: filter_iterator.hpp:285
genesis::population::PoolDiversityResults::variant_count
size_t variant_count
Count of variants in the iterator range that surpass the PoolDiversitySettings::min_allele_count sett...
Definition: diversity.hpp:104
genesis::population::beta_star
double beta_star(double n)
Compute beta* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity.cpp:291
genesis::population::theta_pi_pool_denominator
double theta_pi_pool_denominator(PoolDiversitySettings const &settings, size_t nucleotide_count)
Compute the denominator for the pool-sequencing correction of theta pi according to Kofler et al.
Definition: diversity.cpp:136
genesis::population::PoolDiversityResults::coverage_count
size_t coverage_count
Out of the variant_count many variants, how many are properly covered, that is, have coverage in betw...
Definition: diversity.hpp:114
genesis::population::PoolDiversitySettings::with_popoolation_bugs
bool with_popoolation_bugs
Definition: diversity.hpp:82
genesis::population::tajima_d_pool
double tajima_d_pool(PoolDiversitySettings const &settings, ForwardIterator begin, ForwardIterator end, double theta_pi, double theta_watterson)
Compute the pool-sequencing corrected version of Tajima's D according to Kofler et al.
Definition: diversity.hpp:430
genesis::population::theta_pi
double theta_pi(ForwardIterator begin, ForwardIterator end, bool with_bessel=true)
Compute classic theta pi, that is, the sum of heterozygosities.
Definition: diversity.hpp:178
genesis::population::transform_zero_out_by_min_count
void transform_zero_out_by_min_count(BaseCounts &sample, size_t min_count)
Transform a BaseCounts sample by setting any nucleotide count (A, C, G, T) to zero if min_count is no...
Definition: filter_transform.cpp:77
genesis::population::PoolDiversityResults::theta_pi_absolute
double theta_pi_absolute
Definition: diversity.hpp:94
genesis::population::PoolDiversityResults::snp_count
size_t snp_count
Out of the variant_count and coverage_count many variants, how many are SNPs, that is,...
Definition: diversity.hpp:123
genesis::utils::make_transform_range
Range< TransformIterator< TransformFunctor, BaseIterator > > make_transform_range(TransformFunctor unary_func, BaseIterator begin, BaseIterator end)
Construct a transforming range, given the transformation function as well as the underlying base iter...
Definition: transform_iterator.hpp:438
variant.hpp
genesis::population::theta_pi_within_pool
double theta_pi_within_pool(ForwardIterator begin, ForwardIterator end, size_t poolsize)
Compute classic theta pi (within a population), that is, the sum of heterozygosities including Bessel...
Definition: diversity.hpp:240
functions.hpp
genesis::population::alpha_star
double alpha_star(double n)
Compute alpha* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity.cpp:263
genesis::population::PoolDiversityResults::theta_pi_relative
double theta_pi_relative
Definition: diversity.hpp:95
genesis::population::PoolDiversitySettings
Settings used by different pool-sequencing corrected diversity statistics.
Definition: diversity.hpp:74
filter_transform.hpp
genesis::population::PoolDiversityResults
Data struct to collect all diversity statistics computed by pool_diversity_measures().
Definition: diversity.hpp:92