41 #include <unordered_map>
48 namespace population {
65 size_t nucleotide_count,
66 size_t allele_frequency
79 size_t poolsize,
size_t nucleotide_count,
size_t allele_frequency
84 for(
size_t r = 1; r <= poolsize - 1; ++r ) {
85 double const p =
static_cast<double>( r ) /
static_cast<double>( poolsize );
93 allele_frequency, nucleotide_count, p,
true
95 double const partial = binom /
static_cast<double>( r );
103 return amnm_cache_( poolsize, nucleotide_count, allele_frequency );
115 double const nt_cnt =
static_cast<double>(
nucleotide_sum( sample ));
127 h *= nt_cnt / ( nt_cnt - 1.0 );
138 size_t nucleotide_count
147 size_t poolsize,
size_t min_allele_count,
size_t nucleotide_count
150 if( 2 * min_allele_count > nucleotide_count ) {
151 throw std::invalid_argument(
152 "Cannot compute theta_pi_pool_denominator with min_allele_count = " +
161 #pragma omp parallel for
162 for(
size_t m_it = min_allele_count; m_it <= ( nucleotide_count - min_allele_count ); ++m_it ) {
165 double const m =
static_cast<double>( m_it );
166 double const M =
static_cast<double>( nucleotide_count );
169 double const term = ( 2.0 * m * ( M - m )) / ( M * ( M - 1.0 ));
170 double const partial = term *
amnm_( poolsize, nucleotide_count, m_it );
188 size_t nucleotide_count
197 size_t poolsize,
size_t min_allele_count,
size_t nucleotide_count
200 if( 2 * min_allele_count > nucleotide_count ) {
201 throw std::invalid_argument(
202 "Cannot compute theta_watterson_pool_denominator with min_allele_count = " +
211 #pragma omp parallel for
212 for(
size_t m_it = min_allele_count; m_it <= ( nucleotide_count - min_allele_count ); ++m_it ) {
215 auto const anmn =
amnm_( poolsize, nucleotide_count, m_it );
236 for(
size_t i = 1; i < n; ++i ) {
237 sum += 1.0 /
static_cast<double>( i );
241 return a_n_cache_( n );
249 for(
size_t i = 1; i < n; ++i ) {
250 sum += 1.0 / (
static_cast<double>( i ) *
static_cast<double>( i ));
254 return b_n_cache_( n );
259 double const nd =
static_cast<double>( n );
260 return ( nd - 3.0 ) / (
a_n * ( nd - 1.0 ) - nd );
266 throw std::invalid_argument(
"Cannot compute alpha star with effective coverage n <= 1" );
274 double const nd =
static_cast<double>( n );
275 double const an =
a_n( n );
276 double const fs =
f_star( an, n );
279 auto const t1 =
squared( fs ) * ( an - ( nd / ( nd - 1.0 )));
280 auto const t2s1 = an * (( 4.0 * ( nd + 1.0 )) /
squared( nd - 1.0 ));
281 auto const t2s2 = 2.0 * (( nd + 3.0 ) / ( nd - 1.0 ));
282 auto const t2 = fs * ( t2s1 - t2s2 );
283 auto const t3 = an * (( 8.0 * ( nd + 1.0 )) / ( nd *
squared( nd - 1.0 )));
284 auto const t4 = (
squared( nd ) + nd + 60.0 ) / ( 3.0 * nd * ( nd - 1.0 ));
285 return t1 + t2 - t3 + t4;
288 return alpha_star_cache_( n );
294 throw std::invalid_argument(
"Cannot compute beta star with effective coverage n <= 1" );
302 double const nd =
static_cast<double>( n );
303 double const an =
a_n( n );
304 double const bn =
b_n( n );
305 double const fs =
f_star( an, n );
308 auto const t1 =
squared( fs ) * ( bn - (( 2.0 * ( nd - 1.0 )) /
squared( nd - 1.0 )));
309 auto const t2s1 = bn * ( 8.0 / ( nd - 1.0 ));
310 auto const t2s2 = an * ( 4.0 / ( nd * ( nd - 1.0 )));
311 auto const t2s3n =
cubed( nd ) + 12.0 *
squared( nd ) - 35.0 * nd + 18.0;
312 auto const t2s3d = nd *
squared( nd - 1.0 );
313 auto const t2s3 = t2s3n / t2s3d;
314 auto const t2 = fs * ( t2s1 - t2s2 - t2s3 );
315 auto const t3 = bn * ( 16.0 / ( nd * ( nd - 1.0 )));
316 auto const t4 = an * ( 8.0 / (
squared( nd ) * ( nd - 1.0 )));
317 auto const t5s1 = 2.0 * ( std::pow( nd, 4 ) + 110.0 *
squared( nd ) - 255.0 * nd + 126.0 );
319 auto const t5 = t5s1 / t5s2;
320 return t1 + t2 - t3 + t4 + t5;
323 return beta_star_cache_( n );
327 size_t max_coverage,
size_t poolsize
333 auto const max_width = poolsize;
337 double const poold =
static_cast<double>( poolsize );
340 result( 0, 0 ) = 1.0;
341 for(
size_t i = 1; i < max_coverage + 1; ++i ) {
342 result( i, 0 ) = 0.0;
344 for(
size_t j = 1; j < max_width + 1; ++j ) {
345 result( 0, j ) = 0.0;
349 for(
size_t i = 1; i < max_coverage + 1; ++i ) {
350 for(
size_t j = 1; j < max_width + 1; ++j ) {
351 auto const t1s1 = (( 1.0 + poold -
static_cast<double>( j )) / poold );
352 auto const t1s2 = result( i - 1, j - 1 );
353 auto const t2 = (
static_cast<double>( j ) / poold ) * result( i-1, j );
354 result( i, j ) = t1s1 * t1s2 + t2;
361 size_t max_coverage,
size_t poolsize
375 static std::unordered_map<size_t, PijMatrix> pij_matrix_cache_;
378 static std::mutex guard_;
379 std::lock_guard<std::mutex> lock( guard_ );
383 pij_matrix_cache_.count( poolsize ) == 0 ||
384 max_coverage >= pij_matrix_cache_.at( poolsize ).rows() ||
385 poolsize + 1 != pij_matrix_cache_.at( poolsize ).cols()
389 pij_matrix_cache_[ poolsize ] =
pij_matrix_( 3 * max_coverage, poolsize );
392 assert( pij_matrix_cache_.count( poolsize ) > 0 );
393 assert( max_coverage <= pij_matrix_cache_.at( poolsize ).rows() );
394 assert( poolsize <= pij_matrix_cache_.at( poolsize ).cols() );
395 return pij_matrix_cache_[ poolsize ];
413 size_t coverage,
size_t poolsize
420 auto const minj = ( coverage < poolsize ) ? coverage : poolsize;
421 for(
size_t k = 1; k <= minj; ++k ) {
422 assert( coverage < pij_matrix.rows() );
423 assert( k < pij_matrix.cols() );
424 nbase +=
static_cast<double>( k ) * pij_matrix( coverage, k );
429 return nbase_cache_( coverage, poolsize );
432 double n_base(
size_t coverage,
size_t poolsize )
442 double const p =
static_cast<double>( coverage );
443 double const n =
static_cast<double>( poolsize );
444 return n * ( 1.0 - std::pow(( n - 1.0 ) / n, p ));
465 throw std::invalid_argument(
466 "Minimum allele count needs to be set to 2 for calculating pool-corrected Tajima's D "
467 "with tajima_d_pool() according to Kofler et al. In case 2 is insufficient, "
468 "we recommend to subsample the reads to a smaller coverage."
472 throw std::invalid_argument(
473 "Invalid mincoverage >> poolsize (as internal aproximation we use: "
474 "3 * minimumcoverage < poolsize) in tajima_d_pool()"
490 alphastar =
static_cast<double>(
beta_star( avg_n ));
491 betastar = alphastar;
494 alphastar =
static_cast<double>(
alpha_star( avg_n ));
495 betastar =
static_cast<double>(
beta_star( avg_n ));;
499 ( alphastar /
static_cast<double>( snp_count )) * theta + betastar *
squared( theta )