A library for working with phylogenetic data.
v0.25.0
diversity.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTIONS_DIVERSITY_H_
2 #define GENESIS_POPULATION_FUNCTIONS_DIVERSITY_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2021 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
40 
41 #include <cassert>
42 #include <cmath>
43 #include <cstdint>
44 #include <iterator>
45 #include <string>
46 #include <type_traits>
47 #include <vector>
48 
49 namespace genesis {
50 namespace population {
51 
52 // =================================================================================================
53 // Diversity Estimates
54 // =================================================================================================
55 
74 {
75  size_t window_width = 0;
76  size_t window_stride = 0;
77  size_t min_phred_score = 0;
78 
79  size_t poolsize = 0;
80  size_t min_allele_count = 0;
81  size_t min_coverage = 0;
82  size_t max_coverage = 0;
83  double min_coverage_fraction = 0.0;
84  bool with_popoolation_bugs = false;
85 };
86 
91 {
92  double theta_pi_absolute = 0;
93  double theta_pi_relative = 0;
96  double tajima_d = 0;
97 
98  size_t variant_count = 0;
99  size_t snp_count = 0;
100  size_t coverage_count = 0;
101  double coverage_fraction = 0.0;
102 };
103 
118 double heterozygosity( BaseCounts const& sample );
119 
120 // =================================================================================================
121 // Theta Pi
122 // =================================================================================================
123 
144  PoolDiversitySettings const& settings,
145  size_t nucleotide_count
146 );
147 
153 template<class ForwardIterator>
154 double theta_pi(
155  ForwardIterator begin, ForwardIterator end
156 ) {
157  double pi_sum = 0.0;
158  for( auto& it = begin; it != end; ++it ) {
159  pi_sum += heterozygosity( *it );
160  }
161  return pi_sum;
162 }
163 
172 template<class ForwardIterator>
173 double theta_pi_pool( // get_pi_calculator
174  PoolDiversitySettings const& settings,
175  ForwardIterator begin,
176  ForwardIterator end
177 ) {
178  // PoPoolation variable names:
179  // poolsize: n
180  // min_allele_count: b
181 
182  double pi_sum = 0.0;
183  for( auto& it = begin; it != end; ++it ) {
184  double pi_snp = heterozygosity( *it );
185  pi_snp /= theta_pi_pool_denominator( settings, nucleotide_sum( *it ));
186  pi_sum += pi_snp;
187  }
188  return pi_sum;
189 }
190 
195 inline double theta_pi_pool(
196  PoolDiversitySettings const& settings,
197  BaseCounts const& sample
198 ) {
199  return heterozygosity( sample ) / theta_pi_pool_denominator( settings, nucleotide_sum( sample ));
200 }
201 
202 // =================================================================================================
203 // Theta Watterson
204 // =================================================================================================
205 
213  PoolDiversitySettings const& settings,
214  size_t nucleotide_count
215 );
216 
220 template<class ForwardIterator>
221 double theta_watterson_pool( // get_theta_calculator
222  PoolDiversitySettings const& settings,
223  ForwardIterator begin,
224  ForwardIterator end
225 ) {
226  // PoPoolation variable names:
227  // poolsize: n
228  // min_allele_count: b
229 
230  double sum = 0.0;
231  for( auto& it = begin; it != end; ++it ) {
232  sum += 1.0 / theta_watterson_pool_denominator( settings, nucleotide_sum( *it ));
233  }
234  return sum;
235 }
236 
237 // =================================================================================================
238 // Tajima's D
239 // =================================================================================================
240 
253 double a_n( size_t n );
254 
272 double b_n( size_t n );
273 
286 double f_star( double a_n, double n );
287 
311 double alpha_star( double n );
312 
318 double beta_star( double n );
319 
343 double n_base_matrix( size_t coverage, size_t poolsize );
344 
356 double n_base( size_t coverage, size_t poolsize );
357 
363  PoolDiversitySettings const& settings,
364  size_t snp_count,
365  double theta
366 );
367 
372 template<class ForwardIterator>
373 double tajima_d_pool( // get_D_calculator
374  PoolDiversitySettings const& settings,
375  ForwardIterator begin,
376  ForwardIterator end,
377  double theta_pi,
378  double theta_watterson
379 ) {
380  // Edge case, following what PoPoolation does in this situation.
381  if( begin == end ) {
382  return 0.0;
383  }
384 
385  // We already have the two theta statistics given here, but need to compute the
386  // denominator according to Kofler et al for pooled sequences.
387  auto const snp_cnt = std::distance( begin, end );
388  auto const denom = tajima_d_pool_denominator( settings, snp_cnt, theta_watterson );
389  return ( theta_pi - theta_watterson ) / denom;
390 }
391 
396 template<class ForwardIterator>
397 double tajima_d_pool( // get_D_calculator
398  PoolDiversitySettings const& settings,
399  ForwardIterator begin,
400  ForwardIterator end
401 ) {
402  // First compute the two theta statistics, then call the other version of this function.
403  auto const pi = theta_pi_pool( settings, begin, end );
404  auto const tw = theta_watterson_pool( settings, begin, end );
405  return tajima_d_pool( settings, begin, end, pi, tw );
406 }
407 
408 // =================================================================================================
409 // Diversity Measures
410 // =================================================================================================
411 
426 template<class ForwardIterator>
428  PoolDiversitySettings const& settings,
429  ForwardIterator begin,
430  ForwardIterator end
431 ) {
432  PoolDiversityResults results;
433 
434  // Filter all counts that are below the given min count.
435  // The following is inefficient, as we do the transform multiple times in all the loops
436  // below (note that all statistics functions also loop at least once!). But this function
437  // here is meant as a high level variant, and we have to live with that.
438  // Better solution is to run filter_min_count() already when adding the samples
439  // to the range that we loop over here.
440  auto min_filtered_range = utils::make_transform_range(
441  [&]( BaseCounts const& sample ){
442  auto copy = sample;
443  filter_min_count( copy, settings.min_allele_count );
444  return copy;
445  },
446  begin, end
447  );
448 
449  // Count how many SNPs there are in total, and how many sites have the needed coverage.
450  for( auto it = min_filtered_range.begin(); it != min_filtered_range.end(); ++it ) {
451  auto const stat = status(
452  *it, settings.min_coverage, settings.max_coverage, settings.min_allele_count
453  );
454 
455  // Make sure that we are dealing with the correct types here.
456  // (Just in case we ever refactor the class where these values are stored.)
457  static_assert(
458  std::is_same<decltype(stat.is_snp), bool>::value,
459  "Expect bool type for BaseCountsStatus::is_snp"
460  );
461  static_assert(
462  std::is_same<decltype(stat.is_covered), bool>::value,
463  "Expect bool type for BaseCountsStatus::is_covered"
464  );
465  static_assert( static_cast<size_t>( true ) == 1, "Expect true == 1" );
466  static_assert( static_cast<size_t>( false ) == 0, "Expect false == 0" );
467 
468  // Add them up.
469  ++results.variant_count;
470  results.snp_count += static_cast<size_t>( stat.is_snp );
471  results.coverage_count += static_cast<size_t>( stat.is_covered );
472  }
473 
474  // Compute coverage over the given range.
475  auto const coverage = static_cast<double>( results.coverage_count );
476  results.coverage_fraction = coverage / static_cast<double>( settings.window_width );
477 
478  // Make a filter that only allows samples that are SNPs and have the needed coverage.
479  auto covered_snps_range = utils::make_filter_range( [&]( BaseCounts const& sample ){
480  auto stat = status(
481  sample, settings.min_coverage, settings.max_coverage, settings.min_allele_count
482  );
483  return stat.is_snp && stat.is_covered;
484  }, min_filtered_range.begin(), min_filtered_range.end() );
485 
486  // Compute all values.
488  settings, covered_snps_range.begin(), covered_snps_range.end()
489  );
491  settings, covered_snps_range.begin(), covered_snps_range.end()
492  );
493  results.tajima_d = tajima_d_pool(
494  settings, covered_snps_range.begin(), covered_snps_range.end(),
496  );
497  results.theta_pi_relative = results.theta_pi_absolute / coverage;
498  results.theta_watterson_relative = results.theta_watterson_absolute / coverage;
499 
500  return results;
501 }
502 
503 } // namespace population
504 } // namespace genesis
505 
506 #endif // include guard
variant.hpp
genesis::population::theta_pi_pool
double theta_pi_pool(PoolDiversitySettings const &settings, ForwardIterator begin, ForwardIterator end)
Compute theta pi with pool-sequencing correction according to Kofler et al, that is,...
Definition: diversity.hpp:173
genesis::population::PoolDiversitySettings::min_coverage_fraction
double min_coverage_fraction
Definition: diversity.hpp:83
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
genesis::population::a_n
double a_n(size_t n)
Compute a_n, the sum of reciprocals.
Definition: diversity.cpp:215
genesis::population::PoolDiversitySettings::min_coverage
size_t min_coverage
Definition: diversity.hpp:81
genesis::population::f_star
double f_star(double a_n, double n)
Compute f* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity.cpp:241
genesis::population::filter_min_count
void filter_min_count(BaseCounts &sample, size_t min_count)
Filter by minimum count that we need for a type of nucleotide (A, C, G, T) to be considered; set to z...
Definition: base_counts.cpp:174
genesis::population::PoolDiversitySettings::min_allele_count
size_t min_allele_count
Definition: diversity.hpp:80
genesis::population::tajima_d_pool_denominator
double tajima_d_pool_denominator(PoolDiversitySettings const &settings, size_t snp_count, double theta)
Compute the denominator for the pool-sequencing correction of Tajima's D according to Kofler et al.
Definition: diversity.cpp:435
filter_iterator.hpp
genesis::population::PoolDiversityResults::tajima_d
double tajima_d
Definition: diversity.hpp:96
genesis::population::PoolDiversitySettings::window_width
size_t window_width
Definition: diversity.hpp:75
genesis::population::heterozygosity
double heterozygosity(BaseCounts const &sample)
Compute classic heterozygosity.
Definition: diversity.cpp:96
genesis::population::PoolDiversityResults::theta_watterson_relative
double theta_watterson_relative
Definition: diversity.hpp:95
genesis::population::PoolDiversitySettings::max_coverage
size_t max_coverage
Definition: diversity.hpp:82
genesis::population::PoolDiversitySettings::poolsize
size_t poolsize
Definition: diversity.hpp:79
genesis::population::PoolDiversitySettings::window_stride
size_t window_stride
Definition: diversity.hpp:76
genesis::population::theta_watterson_pool
double theta_watterson_pool(PoolDiversitySettings const &settings, ForwardIterator begin, ForwardIterator end)
Compute theta watterson with pool-sequencing correction according to Kofler et al.
Definition: diversity.hpp:221
genesis::population::n_base_matrix
double n_base_matrix(size_t coverage, size_t poolsize)
Compute the n_base term used for Tajima's D in Kofler et al. 2011, following their approach.
Definition: diversity.cpp:382
genesis::population::theta_watterson_pool_denominator
double theta_watterson_pool_denominator(PoolDiversitySettings const &settings, size_t nucleotide_count)
Compute the denominator for the pool-sequencing correction of theta watterson according to Kofler et ...
Definition: diversity.cpp:170
genesis::population::pool_diversity_measures
PoolDiversityResults pool_diversity_measures(PoolDiversitySettings const &settings, ForwardIterator begin, ForwardIterator end)
Compute Theta Pi, Theta Watterson, and Tajia's D in their pool-sequencing corrected versions accordin...
Definition: diversity.hpp:427
genesis::population::nucleotide_sum
size_t nucleotide_sum(BaseCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: functions/base_counts.hpp:177
matrix.hpp
genesis::population::b_n
double b_n(size_t n)
Compute b_n, the sum of squared reciprocals.
Definition: diversity.cpp:228
genesis::population::n_base
double n_base(size_t coverage, size_t poolsize)
Compute the n_base term used for Tajima's D in Kofler et al. 2011, using a faster closed form express...
Definition: diversity.cpp:416
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::PoolDiversityResults::theta_watterson_absolute
double theta_watterson_absolute
Definition: diversity.hpp:94
genesis::population::theta_pi
double theta_pi(ForwardIterator begin, ForwardIterator end)
Compute classic theta pi, that is, the sum of heterozygosities.
Definition: diversity.hpp:154
base_counts.hpp
transform_iterator.hpp
genesis::population::status
BaseCountsStatus status(BaseCounts const &sample, size_t min_coverage, size_t max_coverage, size_t min_count, bool tolerate_deletions)
Compute a simple status with useful properties from the counts of a BaseCounts.
Definition: base_counts.cpp:46
genesis::population::BaseCounts
One set of nucleotide base counts, for example for a given sample that represents a pool of sequenced...
Definition: base_counts.hpp:53
genesis::utils::make_filter_range
Range< FilterIterator< PredicateFunctor, BaseIterator > > make_filter_range(PredicateFunctor unary_func, BaseIterator begin, BaseIterator end)
Construct a filtering range, given the filter predicate function as well as the underlying base itera...
Definition: filter_iterator.hpp:285
genesis::population::PoolDiversityResults::variant_count
size_t variant_count
Definition: diversity.hpp:98
genesis::population::beta_star
double beta_star(double n)
Compute beta* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity.cpp:275
genesis::population::theta_pi_pool_denominator
double theta_pi_pool_denominator(PoolDiversitySettings const &settings, size_t nucleotide_count)
Compute the denominator for the pool-sequencing correction of theta pi according to Kofler et al.
Definition: diversity.cpp:120
genesis::population::PoolDiversityResults::coverage_fraction
double coverage_fraction
Definition: diversity.hpp:101
genesis::population::PoolDiversityResults::coverage_count
size_t coverage_count
Definition: diversity.hpp:100
genesis::population::PoolDiversitySettings::with_popoolation_bugs
bool with_popoolation_bugs
Definition: diversity.hpp:84
genesis::population::tajima_d_pool
double tajima_d_pool(PoolDiversitySettings const &settings, ForwardIterator begin, ForwardIterator end, double theta_pi, double theta_watterson)
Compute the pool-sequencing corrected version of Tajima's D according to Kofler et al.
Definition: diversity.hpp:373
genesis::population::PoolDiversityResults::theta_pi_absolute
double theta_pi_absolute
Definition: diversity.hpp:92
genesis::population::PoolDiversityResults::snp_count
size_t snp_count
Definition: diversity.hpp:99
genesis::utils::make_transform_range
Range< TransformIterator< TransformFunctor, BaseIterator > > make_transform_range(TransformFunctor unary_func, BaseIterator begin, BaseIterator end)
Construct a transforming range, given the transformation function as well as the underlying base iter...
Definition: transform_iterator.hpp:438
variant.hpp
genesis::population::PoolDiversitySettings::min_phred_score
size_t min_phred_score
Definition: diversity.hpp:77
genesis::population::alpha_star
double alpha_star(double n)
Compute alpha* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity.cpp:247
genesis::population::PoolDiversityResults::theta_pi_relative
double theta_pi_relative
Definition: diversity.hpp:93
genesis::population::PoolDiversitySettings
Settings used by different pool-sequencing corrected diversity statistics.
Definition: diversity.hpp:73
genesis::population::PoolDiversityResults
Data struct to collect all diversity statistics computed by pool_diversity_measures().
Definition: diversity.hpp:90