A library for working with phylogenetic and population genetic data.
v0.32.0
diversity_pool_functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
37 
38 #include <cassert>
39 #include <cmath>
40 #include <limits>
41 #include <mutex>
42 #include <stdexcept>
43 #include <unordered_map>
44 
45 // #ifdef GENESIS_OPENMP
46 // # include <omp.h>
47 // #endif
48 
49 namespace genesis {
50 namespace population {
51 
52 // =================================================================================================
53 // Local Helper Functions
54 // =================================================================================================
55 
65 double amnm_( // get_aMnm_buffer
66  size_t poolsize, // n
67  size_t nucleotide_count, // M (coverage/read depth)
68  size_t allele_frequency // m, m_it (running variable for b .. M-b)
69 ) {
70  // The terminology in PoPoolation is confusing and differs completely from the paper,
71  // their code is not well documented, and their binomial_term function combines other aspects
72  // of the computation than just computing the binomial distribution,
73  // hence adding to the confusion. Let's try to disentangle:
74  //
75  // It seems, we want a binomial distribution with n being the coverage/nucleotide_count/M,
76  // and k being the allele_frequency/m, and we want p being r (1..poolsize-1) divided by poolsize,
77  // using the r from the below loop (which confusingly is also called k in PoPoolation).
78  // What a mess.
79 
80  // PoPoolation uses a function caching mechanism here, which we also initially did:
81  // We used to have the following function cache here, but it turned out that this actually
82  // was doing more harm than good. It uses a lot of memory (prohibitively so for datasets
83  // with large unfiltered/un-subsampled read depth), and even adds a little bit of extra runtime
84  // due to the cache lookups. At the same time, there is not really a upside here, as the
85  // values computed here are only used locally in theta_pi_pool_denominator() and
86  // theta_watterson_pool_denominator(), which already do their own caching. Due to that,
87  // I think that each value computed here was only requested once exactly by each of those
88  // two functions, so at maximum re-used once, which kind of defies the purpose of caching.
89  // So, deactivated. Way less memory, and a slight improvement in runtim in our tests at least.
90 
91  // static genesis::utils::FunctionCache<double, size_t, size_t, size_t> amnm_cache_{ [](
92  // size_t poolsize, size_t nucleotide_count, size_t allele_frequency
93  // ) {
94  // // ...
95  // }};
96  // return amnm_cache_( poolsize, nucleotide_count, allele_frequency );
97 
98  // Edge case check.
99  if( allele_frequency == 0 ) {
100  throw std::invalid_argument(
101  "In computing amnm_(), allele_frequency == 0 is not allowed. "
102  "This is likely caused by using DiversityPoolSettings.min_count == 0."
103  );
104  }
105 
106  // We need a binomial distribution in the loop below for which the coefficient stays
107  // constant. So we pre-compute that here, and split the computation into its parts.
108  // Quick test on some real data reduced the runtime by 30% using this.
109  // Also, we are staying in log-space until the very end to allow large n and small p;
110  // see log_binomial_distribution() for the underlying implementation.
111  auto const k = allele_frequency;
112  auto const n = nucleotide_count;
113  auto const log_coeff = utils::log_binomial_coefficient( n, k );
114  assert( k <= n );
115 
116  // #pragma omp parallel for
117  double result = 0.0;
118  for( size_t r = 1; r <= poolsize - 1; ++r ) {
119 
120  // Get the probability that we are looking at in this loop iteration.
121  double const p = static_cast<double>( r ) / static_cast<double>( poolsize );
122  assert( std::isfinite( p ) && 0.0 < p && p < 1.0 );
123 
124  // Compute the remaining parts of the binomial that depend on p.
125  // Below, we have the original full function for reference.
126  double const log_pow_1 = static_cast<double>( k ) * std::log( p );
127  double const log_pow_2 = static_cast<double>( n - k ) * std::log( 1.0 - p );
128  double const binom = std::exp( log_coeff + log_pow_1 + log_pow_2 );
129  // double const binom = utils::binomial_distribution(
130  // allele_frequency, nucleotide_count, p
131  // );
132 
133  // Sum up the term.
134  double const partial = binom / static_cast<double>( r );
135  // #pragma omp atomic
136  result += partial;
137 
138  // Early abort. No need to continue once we reach inf or nan.
139  if( ! std::isfinite( result )) {
140  break;
141  }
142  }
143  return result;
144 }
145 
146 // =================================================================================================
147 // Theta Pi
148 // =================================================================================================
149 
150 double heterozygosity( SampleCounts const& sample, bool with_bessel )
151 {
152  using namespace genesis::utils;
153 
154  double h = 1.0;
155  double const nt_cnt = static_cast<double>( nucleotide_sum( sample ));
156  h -= squared( static_cast<double>( sample.a_count ) / nt_cnt );
157  h -= squared( static_cast<double>( sample.c_count ) / nt_cnt );
158  h -= squared( static_cast<double>( sample.g_count ) / nt_cnt );
159  h -= squared( static_cast<double>( sample.t_count ) / nt_cnt );
160 
161  // Slighly slower version with powers.
162  // h -= std::pow( static_cast<double>( sample.a_count ) / nt_cnt, 2 );
163  // h -= std::pow( static_cast<double>( sample.c_count ) / nt_cnt, 2 );
164  // h -= std::pow( static_cast<double>( sample.g_count ) / nt_cnt, 2 );
165  // h -= std::pow( static_cast<double>( sample.t_count ) / nt_cnt, 2 );
166  if( with_bessel ) {
167  h *= nt_cnt / ( nt_cnt - 1.0 );
168  }
169  return h;
170 }
171 
173  DiversityPoolSettings const& settings,
174  size_t poolsize, // n
175  size_t nucleotide_count // M
176 ) {
177  // PoPoolation variable names:
178  // min_count: b
179  // poolsize: n
180  // nucleotide_count: M
181 
182  // Local cache for speed.
184  size_t min_count, size_t poolsize, size_t nucleotide_count
185  ){
186  // Boundary: if not held, we'd return zero, and that would not be a useful denominator.
187  // Update: It's okay, we can return 0 here. The position will just not contribute
188  // to the overall diversity sum then, but still considered for the sum of valid positions.
189  if( 2 * min_count > nucleotide_count ) {
190  return 0.0;
191  // throw std::invalid_argument(
192  // "Cannot compute theta_pi_pool_denominator with min_count = " +
193  // std::to_string( min_count ) + " and nucleotide_count = " +
194  // std::to_string( nucleotide_count )
195  // );
196  }
197 
198  // Iterate all allele frequencies in between the min and max-min boundaries.
199  double denom = 0.0;
200 
201  // #pragma omp parallel for
202  for( size_t m_it = min_count; m_it <= ( nucleotide_count - min_count ); ++m_it ) {
203  // We iterate from b to M-b (in PoPoolation terminology), inclusively.
204  // Use double values however for the computations.
205  double const m = static_cast<double>( m_it );
206  double const M = static_cast<double>( nucleotide_count );
207 
208  // Compute the term. We here use the cache, which also computes results if not yet cached.
209  double const term = ( 2.0 * m * ( M - m )) / ( M * ( M - 1.0 ));
210  double const partial = term * amnm_( poolsize, nucleotide_count, m_it );
211 
212  // #pragma omp atomic
213  denom += partial;
214 
215  // Early abort. No need to continue once we reach inf or nan.
216  if( ! std::isfinite( denom )) {
217  break;
218  }
219  }
220  return denom;
221  }};
222 
223  // Simply return the cached value (which computes them first if not yet cached).
224  return denom_cache_( settings.min_count, poolsize, nucleotide_count );
225 }
226 
227 // =================================================================================================
228 // Theta Watterson
229 // =================================================================================================
230 
232  DiversityPoolSettings const& settings,
233  size_t poolsize,
234  size_t nucleotide_count // M
235 ) {
236  // PoPoolation variable names:
237  // min_count: b
238  // poolsize: n
239  // nucleotide_count: M
240 
241  // Local cache for speed.
243  size_t min_count, size_t poolsize, size_t nucleotide_count
244  ){
245  // Boundary: if not held, we'd return zero, and that would not be a useful denominator.
246  // Update: It's okay, we can return 0 here. The position will just not contribute
247  // to the overall diversity sum then, but still considered for the sum of valid positions.
248  if( 2 * min_count > nucleotide_count ) {
249  return 0.0;
250  // throw std::invalid_argument(
251  // "Cannot compute theta_watterson_pool_denominator with min_count = " +
252  // std::to_string( min_count ) + " and nucleotide_count = " +
253  // std::to_string( nucleotide_count )
254  // );
255  }
256 
257  // Iterate all allele frequencies in between the min and max-min boundaries.
258  double denom = 0.0;
259 
260  // #pragma omp parallel for
261  for( size_t m_it = min_count; m_it <= ( nucleotide_count - min_count ); ++m_it ) {
262 
263  // Compute the term. We here use the cache, which also computes results if not yet cached.
264  auto const anmn = amnm_( poolsize, nucleotide_count, m_it );
265 
266  // #pragma omp atomic
267  denom += anmn;
268 
269  // Early abort. No need to continue once we reach inf or nan.
270  if( ! std::isfinite( denom )) {
271  break;
272  }
273  }
274  return denom;
275  }};
276 
277  // Simply return the cached value (which computes them first if not yet cached).
278  return denom_cache_( settings.min_count, poolsize, nucleotide_count );
279 }
280 
281 // =================================================================================================
282 // Tajima's D Helper Functions
283 // =================================================================================================
284 
285 double a_n( double n ) // get_an_buffer
286 {
287  // Local cache for speed.
288  static genesis::utils::FunctionCache<double, size_t> a_n_cache_{ []( size_t n ){
289  double sum = 0.0;
290  for( size_t i = 1; i < n; ++i ) {
291  sum += 1.0 / static_cast<double>( i );
292  }
293  return sum;
294  }};
295 
296  // The n value that we get here is a double, because following PoPoolation, we compute it
297  // as n_tilde, which is not integer... but we need to use it as an integer here,
298  // and the way that PoPoolation computes n_tilde, it is around 1.99,
299  // so we want to make sure to round to the nearest number, I think.
300  // We do that here, before the cache function call, so that the cache function does not
301  // get affected by close but non-identical doubles that round to the same int.
302  assert( std::isfinite(n) && n >= 0.0 );
303  auto const ni = static_cast<size_t>( std::round( n ));
304  return a_n_cache_( ni );
305 }
306 
307 double b_n( double n ) // get_bn_buffer
308 {
309  // Local cache for speed.
310  static genesis::utils::FunctionCache<double, size_t> b_n_cache_{ []( size_t n ){
311  double sum = 0.0;
312  for( size_t i = 1; i < n; ++i ) {
313  sum += 1.0 / ( static_cast<double>( i ) * static_cast<double>( i ));
314  }
315  return sum;
316  }};
317 
318  // Same logic as in a_n(), see there for details.
319  assert( std::isfinite(n) && n >= 0.0 );
320  auto const ni = static_cast<size_t>( std::round( n ));
321  return b_n_cache_( ni );
322 }
323 
324 double f_star( double a_n, double n ) // calculate_fstar
325 {
326  double const nd = static_cast<double>( n );
327  return ( nd - 3.0 ) / ( a_n * ( nd - 1.0 ) - nd );
328 }
329 
330 double alpha_star( double n ) // get_alphastar_calculator
331 {
332  if( n <= 1 ) {
333  throw std::invalid_argument( "Cannot compute alpha star with effective read depth n <= 1" );
334  }
335 
336  // Local cache for speed.
337  static genesis::utils::FunctionCache<double, double> alpha_star_cache_{ []( double n ){
338  using namespace genesis::utils;
339 
340  // Prepare some constants: n as double, a_n, and f_star.
341  double const nd = static_cast<double>( n );
342  double const an = a_n( n );
343  double const fs = f_star( an, n );
344 
345  // Calculate individual terms (t) and subterms (ts).
346  auto const t1 = squared( fs ) * ( an - ( nd / ( nd - 1.0 )));
347  auto const t2s1 = an * (( 4.0 * ( nd + 1.0 )) / squared( nd - 1.0 ));
348  auto const t2s2 = 2.0 * (( nd + 3.0 ) / ( nd - 1.0 ));
349  auto const t2 = fs * ( t2s1 - t2s2 );
350  auto const t3 = an * (( 8.0 * ( nd + 1.0 )) / ( nd * squared( nd - 1.0 )));
351  auto const t4 = ( squared( nd ) + nd + 60.0 ) / ( 3.0 * nd * ( nd - 1.0 ));
352  return t1 + t2 - t3 + t4;
353  }};
354 
355  return alpha_star_cache_( n );
356 }
357 
358 double beta_star( double n ) // get_betastar_calculator
359 {
360  if( n <= 1 ) {
361  throw std::invalid_argument( "Cannot compute beta star with effective read depth n <= 1" );
362  }
363 
364  // Local cache for speed.
365  static genesis::utils::FunctionCache<double, double> beta_star_cache_{ []( double n ){
366  using namespace genesis::utils;
367 
368  // Prepare some constants: n as double, a_n, b_n, and f_star.
369  double const nd = static_cast<double>( n );
370  double const an = a_n( n );
371  double const bn = b_n( n );
372  double const fs = f_star( an, n );
373 
374  // Calculate individual terms (t) and subterms (ts).
375  // The first term t1 has a mistake in PoPoolation, where they use 2 * ( n - 1 )
376  // instead of ( 2 * n ) - 1, which we have fixed here.
377  auto const t1 = squared( fs ) * ( bn - ((( 2.0 * nd ) - 1.0 ) / squared( nd - 1.0 )));
378  auto const t2s1 = bn * ( 8.0 / ( nd - 1.0 ));
379  auto const t2s2 = an * ( 4.0 / ( nd * ( nd - 1.0 )));
380  auto const t2s3n = cubed( nd ) + 12.0 * squared( nd ) - 35.0 * nd + 18.0;
381  auto const t2s3d = nd * squared( nd - 1.0 );
382  auto const t2s3 = t2s3n / t2s3d;
383  auto const t2 = fs * ( t2s1 - t2s2 - t2s3 );
384  auto const t3 = bn * ( 16.0 / ( nd * ( nd - 1.0 )));
385  auto const t4 = an * ( 8.0 / ( squared( nd ) * ( nd - 1.0 )));
386  auto const t5s1 = 2.0 * ( std::pow( nd, 4 ) + 110.0 * squared( nd ) - 255.0 * nd + 126.0 );
387  auto const t5s2 = 9.0 * ( squared( nd ) * squared( nd - 1.0 ));
388  auto const t5 = t5s1 / t5s2;
389  return t1 + t2 - t3 + t4 + t5;
390  }};
391 
392  return beta_star_cache_( n );
393 }
394 
396  size_t max_read_depth, size_t poolsize
397 ) {
398  // Prepare a matrix with the needed dimensions. PoPoolation only computes this matrix
399  // for min( max_read_depth, poolsize ) many columns, but we go all the way and compute
400  // all that is needed. Just seems cleaner. Also it avoids a bug that PoPoolation might have there.
401  // auto const max_width = max_read_depth < poolsize ? max_read_depth : poolsize;
402  auto const max_width = poolsize;
403  auto result = genesis::utils::Matrix<double>( max_read_depth + 1, max_width + 1 );
404 
405  // Prepare double conversion constants.
406  double const poold = static_cast<double>( poolsize );
407 
408  // Init first row and column, and top left element.
409  result( 0, 0 ) = 1.0;
410  for( size_t i = 1; i < max_read_depth + 1; ++i ) {
411  result( i, 0 ) = 0.0;
412  }
413  for( size_t j = 1; j < max_width + 1; ++j ) {
414  result( 0, j ) = 0.0;
415  }
416 
417  // Compute the remaining entries.
418  for( size_t i = 1; i < max_read_depth + 1; ++i ) {
419  for( size_t j = 1; j < max_width + 1; ++j ) {
420  auto const t1s1 = (( 1.0 + poold - static_cast<double>( j )) / poold );
421  auto const t1s2 = result( i - 1, j - 1 );
422  auto const t2 = ( static_cast<double>( j ) / poold ) * result( i-1, j );
423  result( i, j ) = t1s1 * t1s2 + t2;
424  }
425  }
426  return result;
427 }
428 
429 genesis::utils::Matrix<double> const& pij_matrix_resolver_( // get_nbase_matrix_resolver
430  size_t max_read_depth, size_t poolsize
431 ) {
432  // Here, we need to cache only by poolsize, but additionally make sure that for a given
433  // poolsize, the matrix is large enough for max_read_depth.
434  // If it already is, we can just return it. If not, we compute a large enough matrix first.
435  // We could re-use data from the smaller matrix for the computation here, but that would
436  // be more complex. In terms of runtime, this amortizes pretty quickly, so probably no
437  // need to do this optimization as of now.
438  // We could modify FunctionCache to allow overwriting cached values from the outside,
439  // but that somehow seems unclean for all other standard use cases of that class.
440  // Hence, simple map here, so that we can do our own caching with overwriting.
441 
442  // Map from poolsizes to the matrix for that poolsize.
443  using PijMatrix = genesis::utils::Matrix<double>;
444  static std::unordered_map<size_t, PijMatrix> pij_matrix_cache_;
445 
446  // Make sure this function is called thread savely, for concurrent access to the cache.
447  static std::mutex guard_;
448  std::lock_guard<std::mutex> lock( guard_ );
449 
450  // Check if we already have the correct size, and re-compute if not.
451  if(
452  pij_matrix_cache_.count( poolsize ) == 0 ||
453  max_read_depth >= pij_matrix_cache_.at( poolsize ).rows() ||
454  poolsize + 1 != pij_matrix_cache_.at( poolsize ).cols()
455  ) {
456  // Get a bit of leeway to reduce recomputation. Or maybe this is about the approximation
457  // that PoPoolation does. Not sure. We just copied their approach here...
458  pij_matrix_cache_[ poolsize ] = pij_matrix_( 3 * max_read_depth, poolsize );
459  }
460 
461  assert( pij_matrix_cache_.count( poolsize ) > 0 );
462  assert( max_read_depth <= pij_matrix_cache_.at( poolsize ).rows() );
463  assert( poolsize <= pij_matrix_cache_.at( poolsize ).cols() );
464  return pij_matrix_cache_[ poolsize ];
465 }
466 
467 double n_base_matrix( size_t read_depth, size_t poolsize ) // get_nbase_buffer
468 {
469  // Shortcut? Boundary check? PoPoolation is not clear on this...
470  // Also, might not be needed at all. Who knows. Not me.
471  // News: apparently, not needed, as we cover that case below by flipping roles of the variables.
472  // Not sure why. But let's do as PoPoolation does! Has been flawless before...
473  // if( poolsize <= read_depth ) {
474  // throw std::invalid_argument(
475  // "Cannot compute nbase with poolsize == " + std::to_string( poolsize ) +
476  // " <= read_depth == " + std::to_string( read_depth )
477  // );
478  // }
479 
480  // Local cache for speed.
482  size_t read_depth, size_t poolsize
483  ) {
484 
485  // Get the matrix, cached and by reference, to optimize the access.
486  auto const& pij_matrix = pij_matrix_resolver_( read_depth, poolsize );
487 
488  double nbase = 0.0;
489  auto const minj = ( read_depth < poolsize ) ? read_depth : poolsize;
490  for( size_t k = 1; k <= minj; ++k ) {
491  assert( read_depth < pij_matrix.rows() );
492  assert( k < pij_matrix.cols() );
493  nbase += static_cast<double>( k ) * pij_matrix( read_depth, k );
494  }
495  return nbase;
496  }};
497 
498  return nbase_cache_( read_depth, poolsize );
499 }
500 
501 double n_base( size_t read_depth, size_t poolsize ) // get_nbase_buffer, but better
502 {
503  // The following simple closed form is equivalent to the way more complicated equation given
504  // in that hidden PoPoolation auxiliary equations document. See
505  // https://math.stackexchange.com/questions/72223/finding-expected-number-of-distinct-values-selected-from-a-set-of-integers
506  // for the proof. At the time of writing this, we are however still lacking the proof that
507  // the PoPoolation equation and the PoPoolation implementation are equivalent - they never
508  // show that, and instead just use their recursive dynamic programming approach (which we
509  // re-implemented above) without ever showing (to the best of our knowledge) that this is
510  // the same as the given equation.
511  double const p = static_cast<double>( read_depth );
512  double const n = static_cast<double>( poolsize );
513  return n * ( 1.0 - std::pow(( n - 1.0 ) / n, p ));
514 }
515 
516 // =================================================================================================
517 // Tajima's D
518 // =================================================================================================
519 
520 double tajima_d_pool_denominator( // get_ddivisor
521  DiversityPoolSettings const& settings,
522  double theta,
523  size_t poolsize, // n
524  double window_avg_denom,
525  size_t empirical_min_read_depth
526 ) {
527  // PoPoolation variable names:
528  // min_count: b
529  // poolsize: n
530  // nucleotide_count: M
531 
532  using namespace genesis::utils;
533 
534  // Edge cases, only relevant for the Kofler-based correction denomiator variants.
535  if(
538  ) {
539  if( settings.min_count != 2 ) {
540  throw std::invalid_argument(
541  "Minimum allele count needs to be set to 2 for calculating pool-corrected Tajima's D "
542  "with tajima_d_pool() according to Kofler et al. In case 2 is insufficient, "
543  "we recommend to subsample the reads to a smaller read depth."
544  );
545  }
546  if( settings.min_read_depth == 0 ) {
547  throw std::invalid_argument(
548  "Minimum read depth of 0 is not valid for calculating pool-corrected Tajima's D "
549  "with tajima_d_pool()."
550  );
551  }
552  if( 3 * settings.min_read_depth >= poolsize ) {
553  throw std::invalid_argument(
554  "Invalid minimum read depth >> pool size (as internal aproximation we use: "
555  "3 * minimum read depth < pool size) in tajima_d_pool()"
556  );
557  }
558  }
559 
560  double alphastar;
561  double betastar;
562  switch( settings.tajima_denominator_policy ) {
564  {
565  // Use the emprical minimum read depth to get the value.
566  auto const avg_n = n_base( empirical_min_read_depth, poolsize );
567  alphastar = alpha_star( avg_n );
568  betastar = beta_star( avg_n );
569  break;
570  }
572  {
573  // Fix the bugs from above, but still use the user min cov for n_base.
574  auto const avg_n = n_base( settings.min_read_depth, poolsize );
575  alphastar = alpha_star( avg_n );
576  betastar = beta_star( avg_n );
577  break;
578  }
580  {
581  // We here re-implement two bugs from PoPoolation that massively change the results.
582  // We do this in order to be able to ensure that these are the only differences between
583  // our code and PoPoolation. It is weird and freaky though to conciously implement bugs...
584  auto const avg_n = n_base( poolsize, poolsize );
585  alphastar = beta_star( avg_n );
586  betastar = alphastar;
587  break;
588  }
590  {
591  // Use the pool size instead of anything n_base based.
592  alphastar = alpha_star( poolsize );
593  betastar = beta_star( poolsize );
594  break;
595  }
597  {
598  // No correction at all.
599  return 1.0;
600  }
601  default: {
602  throw std::invalid_argument( "Invalid enum value for TajimaDenominatorPolicy" );
603  }
604  }
605 
606  return std::sqrt(
607  ( alphastar / window_avg_denom ) * theta + betastar * squared( theta )
608  );
609 }
610 
611 } // namespace population
612 } // namespace genesis
genesis::utils::log_binomial_coefficient
double log_binomial_coefficient(size_t n, size_t k)
Compute the logarithm (base e) of the binomial coefficient, that is n choose k, for two integer numbe...
Definition: binomial.cpp:497
binomial.hpp
genesis::population::DiversityPoolSettings::tajima_denominator_policy
TajimaDenominatorPolicy tajima_denominator_policy
Definition: diversity_pool_functions.hpp:147
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
genesis::population::pij_matrix_resolver_
genesis::utils::Matrix< double > const & pij_matrix_resolver_(size_t max_read_depth, size_t poolsize)
Definition: diversity_pool_functions.cpp:429
genesis::population::SampleCounts
One set of nucleotide sample counts, for example for a given sample that represents a pool of sequenc...
Definition: sample_counts.hpp:56
diversity_pool_functions.hpp
genesis::population::theta_watterson_pool_denominator
double theta_watterson_pool_denominator(DiversityPoolSettings const &settings, size_t poolsize, size_t nucleotide_count)
Compute the denominator for the pool-sequencing correction of theta watterson according to Kofler et ...
Definition: diversity_pool_functions.cpp:231
genesis::population::amnm_
double amnm_(size_t poolsize, size_t nucleotide_count, size_t allele_frequency)
Local helper function to compute values for the denominator.
Definition: diversity_pool_functions.cpp:65
genesis::population::f_star
double f_star(double a_n, double n)
Compute f* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity_pool_functions.cpp:324
genesis::population::SampleCounts::a_count
size_type a_count
Count of all A nucleotides that are present in the sample.
Definition: sample_counts.hpp:66
common.hpp
genesis::population::theta_pi_pool_denominator
double theta_pi_pool_denominator(DiversityPoolSettings const &settings, size_t poolsize, size_t nucleotide_count)
Compute the denominator for the pool-sequencing correction of theta pi according to Kofler et al.
Definition: diversity_pool_functions.cpp:172
genesis::population::nucleotide_sum
constexpr size_t nucleotide_sum(SampleCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: population/function/functions.hpp:296
genesis::population::TajimaDenominatorPolicy::kPoolsize
@ kPoolsize
Instead of using n_base() to obtain the number of individuals sequenced (empirical pool size),...
genesis::utils::cubed
constexpr double cubed(double x)
Cube of a number.
Definition: common.hpp:235
genesis::utils::FunctionCache
Simple cache, for example for function return values.
Definition: function_cache.hpp:81
genesis::population::TajimaDenominatorPolicy::kUncorrected
@ kUncorrected
Do not correct Tajima's D at all.
genesis::utils
Definition: placement/formats/edge_color.hpp:42
genesis::utils::Matrix< double >
genesis::population::SampleCounts::t_count
size_type t_count
Count of all T nucleotides that are present in the sample.
Definition: sample_counts.hpp:81
function_cache.hpp
genesis::population::DiversityPoolSettings::min_read_depth
size_t min_read_depth
Definition: diversity_pool_functions.hpp:144
genesis::population::pij_matrix_
genesis::utils::Matrix< double > pij_matrix_(size_t max_read_depth, size_t poolsize)
Definition: diversity_pool_functions.cpp:395
genesis::population::n_base
double n_base(size_t read_depth, size_t poolsize)
Compute the n_base term used for Tajima's D in Kofler et al. 2011, using a faster closed form express...
Definition: diversity_pool_functions.cpp:501
genesis::population::SampleCounts::c_count
size_type c_count
Count of all C nucleotides that are present in the sample.
Definition: sample_counts.hpp:71
matrix.hpp
genesis::population::TajimaDenominatorPolicy::kEmpiricalMinReadDepth
@ kEmpiricalMinReadDepth
Use the empirical minimum read depth found in each window to compute the expected number of individua...
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::b_n
double b_n(double n)
Compute b_n, the sum of squared reciprocals.
Definition: diversity_pool_functions.cpp:307
genesis::population::n_base_matrix
double n_base_matrix(size_t read_depth, size_t poolsize)
Compute the n_base term used for Tajima's D in Kofler et al. 2011, following their approach.
Definition: diversity_pool_functions.cpp:467
genesis::population::beta_star
double beta_star(double n)
Compute beta* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity_pool_functions.cpp:358
genesis::population::DiversityPoolSettings::min_count
size_t min_count
Definition: diversity_pool_functions.hpp:143
genesis::population::a_n
double a_n(double n)
Compute a_n, the sum of reciprocals.
Definition: diversity_pool_functions.cpp:285
genesis::population::TajimaDenominatorPolicy::kWithPopoolationBugs
@ kWithPopoolationBugs
Replicate the original behaviour of PoPoolation <= 1.2.2.
genesis::population::alpha_star
double alpha_star(double n)
Compute alpha* according to Achaz 2008 and Kofler et al. 2011.
Definition: diversity_pool_functions.cpp:330
genesis::population::TajimaDenominatorPolicy::kProvidedMinReadDepth
@ kProvidedMinReadDepth
Fix the bugs of the original PoPoolation, but still use their way of computing the empirical pool siz...
genesis::population::tajima_d_pool_denominator
double tajima_d_pool_denominator(DiversityPoolSettings const &settings, double theta, size_t poolsize, double window_avg_denom, size_t empirical_min_read_depth)
Compute the denominator for the pool-sequencing correction of Tajima's D according to Kofler et al.
Definition: diversity_pool_functions.cpp:520
genesis::utils::squared
constexpr double squared(double x)
Square of a number.
Definition: common.hpp:223
genesis::population::DiversityPoolSettings
Settings used by different pool-sequencing corrected diversity statistics.
Definition: diversity_pool_functions.hpp:141
genesis::population::heterozygosity
double heterozygosity(SampleCounts const &sample, bool with_bessel)
Compute classic heterozygosity.
Definition: diversity_pool_functions.cpp:150
genesis::population::SampleCounts::g_count
size_type g_count
Count of all G nucleotides that are present in the sample.
Definition: sample_counts.hpp:76