A library for working with phylogenetic and population genetic data.
v0.27.0
structure.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTIONS_STRUCTURE_H_
2 #define GENESIS_POPULATION_FUNCTIONS_STRUCTURE_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2022 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
38 
39 #include <algorithm>
40 #include <cassert>
41 #include <cmath>
42 #include <stdexcept>
43 #include <string>
44 #include <utility>
45 #include <vector>
46 
47 namespace genesis {
48 namespace population {
49 
50 // =================================================================================================
51 // Compute Helper
52 // =================================================================================================
53 
54 // Can only be used with C++14, as we _need_ generic lambda expressions (with `auto` type-specifier
55 // in the parameter list). Otherwise, the type that is actually needed for the range iterators
56 // in the fst_functor() call in the function is literally impossible to capture. That is because
57 // the type contains a TransformIterator, which has a lambda as one of its template parameters,
58 // which hence cannot be named or captured in any form other than a lambda or an auto context...
59 // See https://stackoverflow.com/a/4846597/4184258 for details. See the version history of this
60 // file (which is currently named `genesis/population/functions/structure.hpp`) for the previous
61 // instance of this function that also worked under C++11, by simply copy-pasting the contents of
62 // compute_pairwise_f_st() to the functions where it is used...
63 // That was ungly, and this function is currently not needed for our downstream tools
64 // (e.g., grenedalf, see https://github.com/lczech/grenedalf), so we just leave it active here
65 // when C++14 is used for compiling.
66 
67 #if __cplusplus >= 201402L
68 
110 template<class ForwardIterator, typename FstFunctor>
111 utils::Matrix<double> compute_pairwise_f_st(
112  ForwardIterator begin, ForwardIterator end,
113  FstFunctor fst_functor
114 ) {
115  // With no data, return empty result.
116  if( begin == end ) {
117  return {};
118  }
119 
120  // Now we know that there are entries in the rage. Use the first one to get the number of
121  // base pair samples in the range. We later check that this is the same for each entry.
122  // Use that size to initialize the resulting matrix.
123  size_t const size = static_cast<std::vector<BaseCounts> const&>( *begin ).size();
124  auto result = utils::Matrix<double>( size, size, 0.0 );
125 
126  // We use a lambda that returns a tranforming rage to select an entry at a given index
127  // in the set of BaseCounts at the current iterator position.
128  auto select_entry = [size]( ForwardIterator begin, ForwardIterator end, size_t index ){
129  // Currently, in order to use Window here, we need to explicitly use std::vector<BaseCounts>
130  // instead of the more generic T... Bit unfortunate, but good enough for now.
131  // Will have to revisit later if we get to use cases where the BaseCounts are not stored
132  // in a vector, but some other container.
133  // using T = typename ForwardIterator::value_type;
135  [size, index]( std::vector<BaseCounts> const& samples ) -> BaseCounts const& {
136  if( samples.size() != size ) {
137  throw std::runtime_error(
138  "In compute_pairwise_f_st(): The number of BaseCounts in the "
139  "provided range is not consistent throughout the iteration."
140  );
141  }
142  return samples[index];
143  },
144  begin, end
145  );
146  };
147 
148  // Loop over all pairs of entries, and compute f_st for each of these pairs.
149  // That is, in the inner code of the two loops, we run the f_st function that takes
150  // two ranges, providing it with a pair of indices for which we compute the value.
151  for( size_t i = 0; i < size; ++i ) {
152  for( size_t j = i + 1; j < size; ++j ) {
153  auto range_i = select_entry( begin, end, i );
154  auto range_j = select_entry( begin, end, j );
155  auto const fst = fst_functor(
156  i, j,
157  range_i.begin(), range_i.end(),
158  range_j.begin(), range_j.end()
159  );
160  result( i, j ) = fst;
161  result( j, i ) = fst;
162  }
163  }
164 
165  return result;
166 }
167 
168 #endif // __cplusplus >= 201402L
169 
170 // =================================================================================================
171 // F_ST Pool Kofler
172 // =================================================================================================
173 
181 std::tuple<double, double, double> f_st_pool_kofler_pi_snp(
182  BaseCounts const& p1, BaseCounts const& p2
183 );
184 
201 template<class ForwardIterator1, class ForwardIterator2>
202 double f_st_pool_kofler( // get_conventional_fstcalculator
203  size_t p1_poolsize, size_t p2_poolsize,
204  ForwardIterator1 p1_begin, ForwardIterator1 p1_end,
205  ForwardIterator2 p2_begin, ForwardIterator2 p2_end
206 ) {
207  // Edge and error cases
208  if( p1_poolsize <= 1 || p2_poolsize <= 1 ) {
209  throw std::invalid_argument( "Cannot run f_st_pool_kofler() with poolsizes <= 1" );
210  }
211 
212  // Theta Pi values for the two populations and their combination
213  double p1_pi_sum = 0.0;
214  double p2_pi_sum = 0.0;
215  double pp_pi_sum = 0.0;
216 
217  // Iterate the two ranges in parallel. Each iteration is one position in the genome.
218  // In each iteration, p1_it and p2_it point at BaseCounts objects containing nucleotide counts.
219  auto p1_it = p1_begin;
220  auto p2_it = p2_begin;
221  while( p1_it != p1_end && p2_it != p2_end ) {
222 
223  // Compute frequency based pi snps. The tuple returns p1, p2, pp, in that order.
224  auto const pi_snps = f_st_pool_kofler_pi_snp( *p1_it, *p2_it );
225 
226  // Skip invalid entries than can happen when less than two of [ACGT] have counts > 0
227  // in one of the BaseCounts samples.
228  if(
229  std::isfinite( std::get<0>( pi_snps )) &&
230  std::isfinite( std::get<1>( pi_snps )) &&
231  std::isfinite( std::get<2>( pi_snps ))
232  ) {
233  // If we are here, both p1 and p2 have counts. Let's assert.
234  assert( p1_it->a_count + p1_it->c_count + p1_it->g_count + p1_it->t_count > 0 );
235  assert( p2_it->a_count + p2_it->c_count + p2_it->g_count + p2_it->t_count > 0 );
236 
237  // Now add them to the tally.
238  p1_pi_sum += std::get<0>( pi_snps );
239  p2_pi_sum += std::get<1>( pi_snps );
240  pp_pi_sum += std::get<2>( pi_snps );
241  } else {
242  // If we are here, at least one of the two inputs has one or fewer counts in [ACGT],
243  // otherwise, the results would have been finite. Let's assert this.
244  assert(
245  ( p1_it->a_count + p1_it->c_count + p1_it->g_count + p1_it->t_count <= 1 ) ||
246  ( p2_it->a_count + p2_it->c_count + p2_it->g_count + p2_it->t_count <= 1 )
247  );
248  }
249 
250  // Next pair of entries.
251  ++p1_it;
252  ++p2_it;
253  }
254  if( p1_it != p1_end || p2_it != p2_end ) {
255  throw std::invalid_argument(
256  "In f_st_pool_kofler(): Provided ranges have different length."
257  );
258  }
259 
260  // Normalize by poolsize
261  assert( p1_poolsize > 1 && p2_poolsize > 1 );
262  size_t const pp_poolsize = std::min( p1_poolsize, p2_poolsize );
263  p1_pi_sum *= static_cast<double>( p1_poolsize ) / static_cast<double>( p1_poolsize - 1 );
264  p2_pi_sum *= static_cast<double>( p2_poolsize ) / static_cast<double>( p2_poolsize - 1 );
265  pp_pi_sum *= static_cast<double>( pp_poolsize ) / static_cast<double>( pp_poolsize - 1 );
266 
267  // _calculateFstValues
268  double const pp_avg = ( p1_pi_sum + p2_pi_sum ) / 2.0;
269  return ( pp_pi_sum - pp_avg ) / pp_pi_sum;
270 }
271 
272 #if __cplusplus >= 201402L
273 
281 template<class ForwardIterator>
283  std::vector<size_t> const& poolsizes,
284  ForwardIterator begin, ForwardIterator end
285 ) {
286  return compute_pairwise_f_st(
287  begin, end,
288  [&]( size_t i, size_t j, auto p1_begin, auto p1_end, auto p2_begin, auto p2_end ){
289  if( i >= poolsizes.size() || j >= poolsizes.size() ) {
290  throw std::runtime_error(
291  "In f_st_pool_kofler(): Provided ranges have different lengths that "
292  "are not identical to the number of poolsizes provided."
293  );
294  }
295  return f_st_pool_kofler(
296  poolsizes[i], poolsizes[j],
297  p1_begin, p1_end, p2_begin, p2_end
298  );
299  }
300  );
301 }
302 
303 #endif // __cplusplus >= 201402L
304 
305 // =================================================================================================
306 // F_ST Pool Karlsson
307 // =================================================================================================
308 
316 std::pair<double, double> f_st_pool_karlsson_nkdk(
317  std::pair<SortedBaseCounts, SortedBaseCounts> const& sample_counts
318 );
319 
338 template<class ForwardIterator1, class ForwardIterator2>
339 double f_st_pool_karlsson( // get_asymptunbiased_fstcalculator
340  ForwardIterator1 p1_begin, ForwardIterator1 p1_end,
341  ForwardIterator2 p2_begin, ForwardIterator2 p2_end
342 ) {
343  using namespace genesis::utils;
344 
345  // Result value.
346  double sum_nk = 0.0;
347  double sum_dk = 0.0;
348 
349  // Iterate both ranges, summing up N_k and D_k for all their entries.
350  auto p1_it = p1_begin;
351  auto p2_it = p2_begin;
352  while( p1_it != p1_end && p2_it != p2_end ) {
353 
354  // Get intermediate values and add them up.
355  auto const counts = sorted_average_base_counts( *p1_it, *p2_it );
356  auto const nkdk = f_st_pool_karlsson_nkdk( counts );
357  sum_nk += nkdk.first;
358  sum_dk += nkdk.second;
359 
360  // Next pair of entries
361  ++p1_it;
362  ++p2_it;
363  }
364  if( p1_it != p1_end || p2_it != p2_end ) {
365  throw std::invalid_argument(
366  "In f_st_pool_karlsson(): Provided ranges have different length."
367  );
368  }
369 
370  return sum_nk / sum_dk;
371 }
372 
373 #if __cplusplus >= 201402L
374 
382 template<class ForwardIterator>
384  ForwardIterator begin, ForwardIterator end
385 ) {
386  return compute_pairwise_f_st(
387  begin, end,
388  [&]( size_t i, size_t j, auto p1_begin, auto p1_end, auto p2_begin, auto p2_end ){
389  (void) i;
390  (void) j;
391  return f_st_pool_karlsson(
392  p1_begin, p1_end, p2_begin, p2_end
393  );
394  }
395  );
396 }
397 
398 #endif // __cplusplus >= 201402L
399 
400 // =================================================================================================
401 // F_ST Pool Unbiased (Spence)
402 // =================================================================================================
403 
410 std::tuple<double, double, double> f_st_pool_unbiased_pi_snp(
411  size_t p1_poolsize, size_t p2_poolsize,
412  BaseCounts const& p1, BaseCounts const& p2
413 );
414 
432 template<class ForwardIterator1, class ForwardIterator2>
433 std::pair<double, double> f_st_pool_unbiased(
434  size_t p1_poolsize, size_t p2_poolsize,
435  ForwardIterator1 p1_begin, ForwardIterator1 p1_end,
436  ForwardIterator2 p2_begin, ForwardIterator2 p2_end
437 ) {
438  // Edge and error cases
439  if( p1_poolsize <= 1 || p2_poolsize <= 1 ) {
440  throw std::invalid_argument( "Cannot run f_st_pool_unbiased() with poolsizes <= 1" );
441  }
442 
443  // Sums over the window of pi within, between, total.
444  double pi_w_sum = 0.0;
445  double pi_b_sum = 0.0;
446  double pi_t_sum = 0.0;
447 
448  // Iterate the two ranges in parallel. Each iteration is one position in the genome.
449  // In each iteration, p1_it and p2_it point at BaseCounts objects containing nucleotide counts.
450  auto p1_it = p1_begin;
451  auto p2_it = p2_begin;
452  while( p1_it != p1_end && p2_it != p2_end ) {
453 
454  // Compute pi values for the snp.
455  // The tuple `pi_snp` returns pi within, between, and total, in that order.
456  auto const pi_snp = f_st_pool_unbiased_pi_snp( p1_poolsize, p2_poolsize, *p1_it, *p2_it );
457 
458  // Skip invalid entries than can happen when less than two of [ACGT] have
459  // counts > 0 in one of the BaseCounts samples.
460  if(
461  std::isfinite( std::get<0>( pi_snp )) &&
462  std::isfinite( std::get<1>( pi_snp )) &&
463  std::isfinite( std::get<2>( pi_snp ))
464  ) {
465  // If we are here, both p1 and p2 have counts. Let's assert.
466  assert( p1_it->a_count + p1_it->c_count + p1_it->g_count + p1_it->t_count > 0 );
467  assert( p2_it->a_count + p2_it->c_count + p2_it->g_count + p2_it->t_count > 0 );
468 
469  // Now add them to the tally.
470  pi_w_sum += std::get<0>( pi_snp );
471  pi_b_sum += std::get<1>( pi_snp );
472  pi_t_sum += std::get<2>( pi_snp );
473  } else {
474  // If we are here, at least one of the two inputs has one or fewer counts in [ACGT],
475  // otherwise, the results would have been finite. Let's assert this.
476  assert(
477  ( p1_it->a_count + p1_it->c_count + p1_it->g_count + p1_it->t_count <= 1 ) ||
478  ( p2_it->a_count + p2_it->c_count + p2_it->g_count + p2_it->t_count <= 1 )
479  );
480  }
481 
482  // Next pair of entries.
483  ++p1_it;
484  ++p2_it;
485  }
486  if( p1_it != p1_end || p2_it != p2_end ) {
487  throw std::invalid_argument(
488  "In f_st_pool_unbiased(): Provided ranges have different length."
489  );
490  }
491 
492  // Final computation of our two FST estimators, using Nei and Hudson, respectively.
493  double const fst_nei = 1.0 - ( pi_w_sum / pi_t_sum );
494  double const fst_hud = 1.0 - ( pi_w_sum / pi_b_sum );
495  return { fst_nei, fst_hud };
496 }
497 
498 #if __cplusplus >= 201402L
499 
510 template<class ForwardIterator>
511 utils::Matrix<double> f_st_pool_unbiased_nei(
512  std::vector<size_t> const& poolsizes,
513  ForwardIterator begin, ForwardIterator end
514 ) {
515  return compute_pairwise_f_st(
516  begin, end,
517  [&]( size_t i, size_t j, auto p1_begin, auto p1_end, auto p2_begin, auto p2_end ){
518  if( i >= poolsizes.size() || j >= poolsizes.size() ) {
519  throw std::runtime_error(
520  "In f_st_pool_unbiased_nei(): Provided ranges have different lengths that "
521  "are not identical to the number of poolsizes provided."
522  );
523  }
524  return f_st_pool_unbiased(
525  poolsizes[i], poolsizes[j],
526  p1_begin, p1_end, p2_begin, p2_end
527  ).first;
528  }
529  );
530 }
531 
542 template<class ForwardIterator>
543 utils::Matrix<double> f_st_pool_unbiased_hudson(
544  std::vector<size_t> const& poolsizes,
545  ForwardIterator begin, ForwardIterator end
546 ) {
547  return compute_pairwise_f_st(
548  begin, end,
549  [&]( size_t i, size_t j, auto p1_begin, auto p1_end, auto p2_begin, auto p2_end ){
550  if( i >= poolsizes.size() || j >= poolsizes.size() ) {
551  throw std::runtime_error(
552  "In f_st_pool_unbiased_hudson(): Provided ranges have different lengths that "
553  "are not identical to the number of poolsizes provided."
554  );
555  }
556  return f_st_pool_unbiased(
557  poolsizes[i], poolsizes[j],
558  p1_begin, p1_end, p2_begin, p2_end
559  ).second;
560  }
561  );
562 }
563 
564 #endif // __cplusplus >= 201402L
565 
566 } // namespace population
567 } // namespace genesis
568 
569 #endif // include guard
genesis::population::f_st_pool_kofler_pi_snp
std::tuple< double, double, double > f_st_pool_kofler_pi_snp(BaseCounts const &p1, BaseCounts const &p2)
Compute the SNP-based Theta Pi values used in f_st_pool_kofler().
Definition: structure.cpp:46
genesis::population::f_st_pool_karlsson_nkdk
std::pair< double, double > f_st_pool_karlsson_nkdk(std::pair< SortedBaseCounts, SortedBaseCounts > const &sample_counts)
Compute the numerator N_k and denominator D_k needed for the asymptotically unbiased F_ST estimator o...
Definition: structure.cpp:101
genesis::population::f_st_pool_karlsson
double f_st_pool_karlsson(ForwardIterator1 p1_begin, ForwardIterator1 p1_end, ForwardIterator2 p2_begin, ForwardIterator2 p2_end)
Compute the F_ST statistic for pool-sequenced data of Karlsson et al as used in PoPoolation2,...
Definition: structure.hpp:339
genesis::population::f_st_pool_kofler
double f_st_pool_kofler(size_t p1_poolsize, size_t p2_poolsize, ForwardIterator1 p1_begin, ForwardIterator1 p1_end, ForwardIterator2 p2_begin, ForwardIterator2 p2_end)
Compute the F_ST statistic for pool-sequenced data of Kofler et al as used in PoPoolation2,...
Definition: structure.hpp:202
genesis::population::sorted_average_base_counts
std::pair< SortedBaseCounts, SortedBaseCounts > sorted_average_base_counts(BaseCounts const &sample_a, BaseCounts const &sample_b)
Return the sorted base counts of both input samples, orderd by the average frequencies of the nucleot...
Definition: population/functions/functions.cpp:221
genesis::utils
Definition: placement/formats/edge_color.hpp:42
genesis::utils::Matrix< double >
matrix.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
transform_iterator.hpp
genesis::population::f_st_pool_unbiased
std::pair< double, double > f_st_pool_unbiased(size_t p1_poolsize, size_t p2_poolsize, ForwardIterator1 p1_begin, ForwardIterator1 p1_end, ForwardIterator2 p2_begin, ForwardIterator2 p2_end)
Compute our unbiased F_ST statistic for pool-sequenced data for two ranges of BaseCountss.
Definition: structure.hpp:433
genesis::utils::make_transform_range
Range< TransformIterator< TransformFunctor, BaseIterator > > make_transform_range(TransformFunctor unary_func, BaseIterator begin, BaseIterator end)
Construct a transforming range, given the transformation function as well as the underlying base iter...
Definition: transform_iterator.hpp:438
variant.hpp
genesis::population::f_st_pool_unbiased_pi_snp
std::tuple< double, double, double > f_st_pool_unbiased_pi_snp(size_t p1_poolsize, size_t p2_poolsize, BaseCounts const &p1_counts, BaseCounts const &p2_counts)
Compute the SNP-based Theta Pi values used in f_st_pool_unbiased().
Definition: structure.cpp:166
functions.hpp