A library for working with phylogenetic data.
v0.25.0
diversity.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2021 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
36 
37 #include <cassert>
38 #include <cmath>
39 #include <mutex>
40 #include <stdexcept>
41 #include <unordered_map>
42 
43 #ifdef GENESIS_OPENMP
44 # include <omp.h>
45 #endif
46 
47 namespace genesis {
48 namespace population {
49 
50 // =================================================================================================
51 // Local Helper Functions
52 // =================================================================================================
53 
57 double amnm_( // get_aMnm_buffer
58  size_t poolsize, // n
59  size_t nucleotide_count, // M (coverage)
60  size_t allele_frequency // m, m_it (running variable for b .. M-b)
61 ) {
62  // The terminology in PoPoolation is confusing and differs completely from the paper,
63  // their code is not well documented, and their binomial_term function combines other aspects
64  // of the computation than just computing the binomial distribution,
65  // hence adding to the confusion. Let's try to disentangle:
66  //
67  // It seems, we want a binomial distribution with n being the coverage/nucleotide_count/M,
68  // and k being the allele_frequency/m, and we want p being r (1..poolsize-1) divided by poolsize,
69  // using the r from the below loop (which confusingly is also called k in PoPoolation).
70  // What a mess.
71 
73  size_t poolsize, size_t nucleotide_count, size_t allele_frequency
74  ) {
75  double result = 0.0;
76 
77  #pragma omp parallel for
78  for( size_t r = 1; r <= poolsize - 1; ++r ) {
79  double const p = static_cast<double>( r ) / static_cast<double>( poolsize );
80  double const binom = utils::binomial_distribution( allele_frequency, nucleotide_count, p );
81  double const partial = binom / static_cast<double>( r );
82 
83  #pragma omp atomic
84  result += partial;
85  }
86  return result;
87  }};
88 
89  return amnm_cache_( poolsize, nucleotide_count, allele_frequency );
90 }
91 
92 // =================================================================================================
93 // Diversity Estimates
94 // =================================================================================================
95 
96 double heterozygosity( BaseCounts const& sample )
97 {
98  using namespace genesis::utils;
99 
100  double h = 1.0;
101  double const nt_cnt = static_cast<double>( nucleotide_sum( sample ));
102  h -= squared( static_cast<double>( sample.a_count ) / nt_cnt );
103  h -= squared( static_cast<double>( sample.c_count ) / nt_cnt );
104  h -= squared( static_cast<double>( sample.g_count ) / nt_cnt );
105  h -= squared( static_cast<double>( sample.t_count ) / nt_cnt );
106 
107  // Slighly slower version with powers.
108  // h -= std::pow( static_cast<double>( sample.a_count ) / nt_cnt, 2 );
109  // h -= std::pow( static_cast<double>( sample.c_count ) / nt_cnt, 2 );
110  // h -= std::pow( static_cast<double>( sample.g_count ) / nt_cnt, 2 );
111  // h -= std::pow( static_cast<double>( sample.t_count ) / nt_cnt, 2 );
112  h *= nt_cnt / ( nt_cnt - 1.0 );
113  return h;
114 }
115 
116 // =================================================================================================
117 // Theta Pi
118 // =================================================================================================
119 
121  PoolDiversitySettings const& settings,
122  size_t nucleotide_count // M
123 ) {
124  // PoPoolation variable names:
125  // poolsize: n
126  // min_allele_count: b
127  // nucleotide_count: M
128 
129  // Local cache for speed.
131  size_t poolsize, size_t min_allele_count, size_t nucleotide_count
132  ){
133  // Boundary: if not held, we'd return zero, and that would not be a useful denominator.
134  if( 2 * min_allele_count > nucleotide_count ) {
135  throw std::invalid_argument(
136  "Cannot compute theta_pi_pool_denominator with min_allele_count = " +
137  std::to_string( min_allele_count ) + " and nucleotide_count = " +
138  std::to_string( nucleotide_count )
139  );
140  }
141 
142  // Iterate all allele frequencies in between the min and max-min boundaries.
143  double div = 0.0;
144 
145  #pragma omp parallel for
146  for( size_t m_it = min_allele_count; m_it <= ( nucleotide_count - min_allele_count ); ++m_it ) {
147  // We iterate from b to M-b (in PoPoolation terminology), inclusively.
148  // Use double values however for the computations.
149  double const m = static_cast<double>( m_it );
150  double const M = static_cast<double>( nucleotide_count );
151 
152  // Compute the term. We here use the cache, which also computes results if not yet cached.
153  double const term = ( 2.0 * m * ( M - m )) / ( M * ( M - 1.0 ));
154  double const partial = term * amnm_( poolsize, nucleotide_count, m_it );
155 
156  #pragma omp atomic
157  div += partial;
158  }
159  return div;
160  }};
161 
162  // Simply return the cached value (which computes them first if not yet cached).
163  return denom_cache_( settings.poolsize, settings.min_allele_count, nucleotide_count );
164 }
165 
166 // =================================================================================================
167 // Theta Watterson
168 // =================================================================================================
169 
171  PoolDiversitySettings const& settings,
172  size_t nucleotide_count // M
173 ) {
174  // PoPoolation variable names:
175  // poolsize: n
176  // min_allele_count: b
177  // nucleotide_count: M
178 
179  // Local cache for speed.
181  size_t poolsize, size_t min_allele_count, size_t nucleotide_count
182  ){
183  // Boundary: if not held, we'd return zero, and that would not be a useful denominator.
184  if( 2 * min_allele_count > nucleotide_count ) {
185  throw std::invalid_argument(
186  "Cannot compute theta_watterson_pool_denominator with min_allele_count = " +
187  std::to_string( min_allele_count ) + " and nucleotide_count = " +
188  std::to_string( nucleotide_count )
189  );
190  }
191 
192  // Iterate all allele frequencies in between the min and max-min boundaries.
193  double div = 0.0;
194 
195  #pragma omp parallel for
196  for( size_t m_it = min_allele_count; m_it <= ( nucleotide_count - min_allele_count ); ++m_it ) {
197 
198  // Compute the term. We here use the cache, which also computes results if not yet cached.
199  auto const anmn = amnm_( poolsize, nucleotide_count, m_it );
200 
201  #pragma omp atomic
202  div += anmn;
203  }
204  return div;
205  }};
206 
207  // Simply return the cached value (which computes them first if not yet cached).
208  return denom_cache_( settings.poolsize, settings.min_allele_count, nucleotide_count );
209 }
210 
211 // =================================================================================================
212 // Tajima's D Local Helpers
213 // =================================================================================================
214 
215 double a_n( size_t n ) // get_an_buffer
216 {
217  // Local cache for speed.
218  static genesis::utils::FunctionCache<double, size_t> a_n_cache_{ []( size_t n ){
219  double sum = 0.0;
220  for( size_t i = 1; i < n; ++i ) {
221  sum += 1.0 / static_cast<double>( i );
222  }
223  return sum;
224  }};
225  return a_n_cache_( n );
226 }
227 
228 double b_n( size_t n ) // get_bn_buffer
229 {
230  // Local cache for speed.
231  static genesis::utils::FunctionCache<double, size_t> b_n_cache_{ []( size_t n ){
232  double sum = 0.0;
233  for( size_t i = 1; i < n; ++i ) {
234  sum += 1.0 / ( static_cast<double>( i ) * static_cast<double>( i ));
235  }
236  return sum;
237  }};
238  return b_n_cache_( n );
239 }
240 
241 double f_star( double a_n, double n ) // calculate_fstar
242 {
243  double const nd = static_cast<double>( n );
244  return ( nd - 3.0 ) / ( a_n * ( nd - 1.0 ) - nd );
245 }
246 
247 double alpha_star( double n ) // get_alphastar_calculator
248 {
249  if( n <= 1 ) {
250  throw std::invalid_argument( "Cannot compute alpha star with effective coverage n <= 1" );
251  }
252 
253  // Local cache for speed.
254  static genesis::utils::FunctionCache<double, double> alpha_star_cache_{ []( double n ){
255  using namespace genesis::utils;
256 
257  // Prepare some constants: n as double, a_n, and f_star.
258  double const nd = static_cast<double>( n );
259  double const an = a_n( n );
260  double const fs = f_star( an, n );
261 
262  // Calculate individual terms (t) and subterms (ts).
263  auto const t1 = squared( fs ) * ( an - ( nd / ( nd - 1.0 )));
264  auto const t2s1 = an * (( 4.0 * ( nd + 1.0 )) / squared( nd - 1.0 ));
265  auto const t2s2 = 2.0 * (( nd + 3.0 ) / ( nd - 1.0 ));
266  auto const t2 = fs * ( t2s1 - t2s2 );
267  auto const t3 = an * (( 8.0 * ( nd + 1.0 )) / ( nd * squared( nd - 1.0 )));
268  auto const t4 = ( squared( nd ) + nd + 60.0 ) / ( 3.0 * nd * ( nd - 1.0 ));
269  return t1 + t2 - t3 + t4;
270  }};
271 
272  return alpha_star_cache_( n );
273 }
274 
275 double beta_star( double n ) // get_betastar_calculator
276 {
277  if( n <= 1 ) {
278  throw std::invalid_argument( "Cannot compute beta star with effective coverage n <= 1" );
279  }
280 
281  // Local cache for speed.
282  static genesis::utils::FunctionCache<double, double> beta_star_cache_{ []( double n ){
283  using namespace genesis::utils;
284 
285  // Prepare some constants: n as double, a_n, b_n, and f_star.
286  double const nd = static_cast<double>( n );
287  double const an = a_n( n );
288  double const bn = b_n( n );
289  double const fs = f_star( an, n );
290 
291  // Calculate individual terms (t) and subterms (ts).
292  auto const t1 = squared( fs ) * ( bn - (( 2.0 * ( nd - 1.0 )) / squared( nd - 1.0 )));
293  auto const t2s1 = bn * ( 8.0 / ( nd - 1.0 ));
294  auto const t2s2 = an * ( 4.0 / ( nd * ( nd - 1.0 )));
295  auto const t2s3n = cubed( nd ) + 12.0 * squared( nd ) - 35.0 * nd + 18.0;
296  auto const t2s3d = nd * squared( nd - 1.0 );
297  auto const t2s3 = t2s3n / t2s3d;
298  auto const t2 = fs * ( t2s1 - t2s2 - t2s3 );
299  auto const t3 = bn * ( 16.0 / ( nd * ( nd - 1.0 )));
300  auto const t4 = an * ( 8.0 / ( squared( nd ) * ( nd - 1.0 )));
301  auto const t5s1 = 2.0 * ( std::pow( nd, 4 ) + 110.0 * squared( nd ) - 255.0 * nd + 126.0 );
302  auto const t5s2 = 9.0 * ( squared( nd ) * squared( nd - 1.0 ));
303  auto const t5 = t5s1 / t5s2;
304  return t1 + t2 - t3 + t4 + t5;
305  }};
306 
307  return beta_star_cache_( n );
308 }
309 
311  size_t max_coverage, size_t poolsize
312 ) {
313  // Prepare a matrix with the needed dimensions. PoPoolation only computes this matrix
314  // for min( max_coverage, poolsize ) many columns, but we go all the way and compute
315  // all that is needed. Just seems cleaner. Also it avoids a bug that PoPoolation might have there.
316  // auto const max_width = max_coverage < poolsize ? max_coverage : poolsize;
317  auto const max_width = poolsize;
318  auto result = genesis::utils::Matrix<double>( max_coverage + 1, max_width + 1 );
319 
320  // Prepare double conversion constants.
321  double const poold = static_cast<double>( poolsize );
322 
323  // Init first row and column, and top left element.
324  result( 0, 0 ) = 1.0;
325  for( size_t i = 1; i < max_coverage + 1; ++i ) {
326  result( i, 0 ) = 0.0;
327  }
328  for( size_t j = 1; j < max_width + 1; ++j ) {
329  result( 0, j ) = 0.0;
330  }
331 
332  // Compute the remaining entries.
333  for( size_t i = 1; i < max_coverage + 1; ++i ) {
334  for( size_t j = 1; j < max_width + 1; ++j ) {
335  auto const t1s1 = (( 1.0 + poold - static_cast<double>( j )) / poold );
336  auto const t1s2 = result( i - 1, j - 1 );
337  auto const t2 = ( static_cast<double>( j ) / poold ) * result( i-1, j );
338  result( i, j ) = t1s1 * t1s2 + t2;
339  }
340  }
341  return result;
342 }
343 
344 genesis::utils::Matrix<double> const& pij_matrix_resolver_( // get_nbase_matrix_resolver
345  size_t max_coverage, size_t poolsize
346 ) {
347  // Here, we need to cache only by poolsize, but additionally make sure that for a given
348  // poolsize, the matrix is large enough for max_coverage.
349  // If it already is, we can just return it. If not, we compute a large enough matrix first.
350  // We could re-use data from the smaller matrix for the computation here, but that would
351  // be more complex. In terms of runtime, this amortizes pretty quickly, so probably no
352  // need to do this optimization as of now.
353  // We could modify FunctionCache to allow overwriting cached values from the outside,
354  // but that somehow seems unclean for all other standard use cases of that class.
355  // Hence, simple map here, so that we can do our own caching with overwriting.
356 
357  // Map from poolsizes to the matrix for that poolsize.
358  using PijMatrix = genesis::utils::Matrix<double>;
359  static std::unordered_map<size_t, PijMatrix> pij_matrix_cache_;
360 
361  // Make sure this function is called thread savely, for concurrent access to the cache.
362  static std::mutex guard_;
363  std::lock_guard<std::mutex> lock( guard_ );
364 
365  // Check if we already have the correct size, and re-compute if not.
366  if(
367  pij_matrix_cache_.count( poolsize ) == 0 ||
368  max_coverage >= pij_matrix_cache_.at( poolsize ).rows() ||
369  poolsize + 1 != pij_matrix_cache_.at( poolsize ).cols()
370  ) {
371  // Get a bit of leeway to reduce recomputation. Or maybe this is about the approximation
372  // that PoPollation does. Not sure. We just copied their approach here...
373  pij_matrix_cache_[ poolsize ] = pij_matrix_( 3 * max_coverage, poolsize );
374  }
375 
376  assert( pij_matrix_cache_.count( poolsize ) > 0 );
377  assert( max_coverage <= pij_matrix_cache_.at( poolsize ).rows() );
378  assert( poolsize <= pij_matrix_cache_.at( poolsize ).cols() );
379  return pij_matrix_cache_[ poolsize ];
380 }
381 
382 double n_base_matrix( size_t coverage, size_t poolsize ) // get_nbase_buffer
383 {
384  // Shortcut? Boundary check? PoPoolation is not clear on this...
385  // Also, might not be needed at all. Who knows. Not me.
386  // News: apparently, not needed, as we cover that case below by flipping roles of the variables.
387  // Not sure why. But let's do as PoPoolation does!
388  // if( poolsize <= coverage ) {
389  // throw std::invalid_argument(
390  // "Cannot compute nbase with poolsize == " + std::to_string( poolsize ) +
391  // " <= coverage == " + std::to_string( coverage )
392  // );
393  // }
394 
395  // Local cache for speed.
397  size_t coverage, size_t poolsize
398  ) {
399 
400  // Get the matrix, cached and by reference, to optimize the access.
401  auto const& pij_matrix = pij_matrix_resolver_( coverage, poolsize );
402 
403  double nbase = 0.0;
404  auto const minj = ( coverage < poolsize ) ? coverage : poolsize;
405  for( size_t k = 1; k <= minj; ++k ) {
406  assert( coverage < pij_matrix.rows() );
407  assert( k < pij_matrix.cols() );
408  nbase += static_cast<double>( k ) * pij_matrix( coverage, k );
409  }
410  return nbase;
411  }};
412 
413  return nbase_cache_( coverage, poolsize );
414 }
415 
416 double n_base( size_t coverage, size_t poolsize ) // get_nbase_buffer, but better
417 {
418  // The following simple closed form is equivalent to the way more complicated equation given
419  // in that hidden PoPoolation auxiliary equations document. See
420  // https://math.stackexchange.com/questions/72223/finding-expected-number-of-distinct-values-selected-from-a-set-of-integers
421  // for the proof. At the time of writing this, we are however still lacking the proof that
422  // the PoPoolation equation and the PoPoolation implementation are equivalent - they never
423  // show that, and instead just use their recursive dynamic programming approach (which we
424  // re-implemented above) without ever showing (to the best of our knowledge) that this is
425  // the same as the given equation.
426  double const p = static_cast<double>( coverage );
427  double const n = static_cast<double>( poolsize );
428  return n * ( 1.0 - std::pow(( n - 1.0 ) / n, p ));
429 }
430 
431 // =================================================================================================
432 // Tajima's D
433 // =================================================================================================
434 
435 double tajima_d_pool_denominator( // get_ddivisor
436  PoolDiversitySettings const& settings,
437  size_t snp_count,
438  double theta
439 ) {
440  // PoPoolation variable names:
441  // poolsize: n
442  // min_allele_count: b
443  // nucleotide_count: M
444 
445  using namespace genesis::utils;
446 
447  // Edge cases.
448  if( settings.min_allele_count != 2 ) {
449  throw std::invalid_argument(
450  "Minimum allele count needs to be set to 2 for calculating pool-corrected Tajima's D "
451  "with tajima_d_pool(). In case 2 is insufficient, we recommend to subsample the reads "
452  "to a smaller coverage."
453  );
454  }
455  if( 3 * settings.min_coverage >= settings.poolsize ) {
456  throw std::invalid_argument(
457  "Invalid mincoverage >> poolsize (as internal aproximation we use: "
458  "3 * minimumcoverage < poolsize) in tajima_d_pool()"
459  );
460  }
461 
462  // TODO The average of n seems to be calcualted as the expected value of picking distinct
463  // individuals from a pool. This value is however not an integer. But the alpha star and
464  // beta star computations assume integers. Not sure what to make of this...
465 
466  // We here re-implement two bugs from PoPoolation that massively change the results.
467  // We do this in order to be able to ensure that these are the only differences between
468  // our code and PoPoolation. It is weird and freaky though to conciously implement bugs...
469  double avg_n;
470  double alphastar;
471  double betastar;
472  if( settings.with_popoolation_bugs ) {
473  avg_n = n_base( settings.poolsize, settings.poolsize );
474  alphastar = static_cast<double>( beta_star( avg_n ));
475  betastar = alphastar;
476  } else {
477  avg_n = n_base( settings.min_coverage, settings.poolsize );
478  alphastar = static_cast<double>( alpha_star( avg_n ));
479  betastar = static_cast<double>( beta_star( avg_n ));;
480  }
481 
482  return std::sqrt(
483  ( alphastar / static_cast<double>( snp_count )) * theta + betastar * squared( theta )
484  );
485 }
486 
487 } // namespace population
488 } // namespace genesis
genesis::population::pij_matrix_
genesis::utils::Matrix< double > pij_matrix_(size_t max_coverage, size_t poolsize)
Definition: diversity.cpp:310
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
genesis::population::a_n
double a_n(size_t n)
Compute a_n, the sum of reciprocals.
Definition: diversity.cpp:215
genesis::population::BaseCounts::t_count
size_t t_count
Count of all T nucleotides that are present in the sample.
Definition: base_counts.hpp:73
genesis::population::PoolDiversitySettings::min_coverage
size_t min_coverage
Definition: diversity.hpp:81
genesis::population::amnm_
double amnm_(size_t poolsize, size_t nucleotide_count, size_t allele_frequency)
Local helper function to compute values for the denominator.
Definition: diversity.cpp:57
genesis::population::f_star
double f_star(double a_n, double n)
Compute f* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity.cpp:241
common.hpp
genesis::population::PoolDiversitySettings::min_allele_count
size_t min_allele_count
Definition: diversity.hpp:80
genesis::population::BaseCounts::g_count
size_t g_count
Count of all G nucleotides that are present in the sample.
Definition: base_counts.hpp:68
genesis::utils::cubed
constexpr double cubed(double x)
Cube of a number.
Definition: common.hpp:253
genesis::utils::binomial_distribution
double binomial_distribution(size_t k, size_t n, double p)
Compute the probability mass function for a binomial distribution.
Definition: common.hpp:107
genesis::utils::FunctionCache
Simple cache, for example for function return values.
Definition: function_cache.hpp:81
genesis::population::BaseCounts::a_count
size_t a_count
Count of all A nucleotides that are present in the sample.
Definition: base_counts.hpp:58
genesis::population::tajima_d_pool_denominator
double tajima_d_pool_denominator(PoolDiversitySettings const &settings, size_t snp_count, double theta)
Compute the denominator for the pool-sequencing correction of Tajima's D according to Kofler et al.
Definition: diversity.cpp:435
genesis::population::heterozygosity
double heterozygosity(BaseCounts const &sample)
Compute classic heterozygosity.
Definition: diversity.cpp:96
diversity.hpp
genesis::utils
Definition: placement/formats/edge_color.hpp:42
genesis::utils::Matrix< double >
genesis::population::PoolDiversitySettings::poolsize
size_t poolsize
Definition: diversity.hpp:79
function_cache.hpp
genesis::population::n_base_matrix
double n_base_matrix(size_t coverage, size_t poolsize)
Compute the n_base term used for Tajima's D in Kofler et al. 2011, following their approach.
Definition: diversity.cpp:382
genesis::population::pij_matrix_resolver_
genesis::utils::Matrix< double > const & pij_matrix_resolver_(size_t max_coverage, size_t poolsize)
Definition: diversity.cpp:344
genesis::population::theta_watterson_pool_denominator
double theta_watterson_pool_denominator(PoolDiversitySettings const &settings, size_t nucleotide_count)
Compute the denominator for the pool-sequencing correction of theta watterson according to Kofler et ...
Definition: diversity.cpp:170
genesis::population::nucleotide_sum
size_t nucleotide_sum(BaseCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: functions/base_counts.hpp:177
matrix.hpp
genesis::population::b_n
double b_n(size_t n)
Compute b_n, the sum of squared reciprocals.
Definition: diversity.cpp:228
genesis::population::n_base
double n_base(size_t coverage, size_t poolsize)
Compute the n_base term used for Tajima's D in Kofler et al. 2011, using a faster closed form express...
Definition: diversity.cpp:416
genesis::population::BaseCounts::c_count
size_t c_count
Count of all C nucleotides that are present in the sample.
Definition: base_counts.hpp:63
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::BaseCounts
One set of nucleotide base counts, for example for a given sample that represents a pool of sequenced...
Definition: base_counts.hpp:53
genesis::population::beta_star
double beta_star(double n)
Compute beta* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity.cpp:275
genesis::population::theta_pi_pool_denominator
double theta_pi_pool_denominator(PoolDiversitySettings const &settings, size_t nucleotide_count)
Compute the denominator for the pool-sequencing correction of theta pi according to Kofler et al.
Definition: diversity.cpp:120
genesis::population::to_string
std::string to_string(GenomeRegion const &region)
Definition: functions/genome_region.cpp:55
genesis::population::PoolDiversitySettings::with_popoolation_bugs
bool with_popoolation_bugs
Definition: diversity.hpp:84
genesis::population::alpha_star
double alpha_star(double n)
Compute alpha* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity.cpp:247
genesis::utils::squared
constexpr double squared(double x)
Square of a number.
Definition: common.hpp:241
genesis::population::PoolDiversitySettings
Settings used by different pool-sequencing corrected diversity statistics.
Definition: diversity.hpp:73