A library for working with phylogenetic data.
v0.25.0
structure.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTIONS_STRUCTURE_H_
2 #define GENESIS_POPULATION_FUNCTIONS_STRUCTURE_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2021 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
38 
39 #include <algorithm>
40 #include <cassert>
41 #include <cmath>
42 #include <stdexcept>
43 #include <string>
44 #include <utility>
45 #include <vector>
46 
47 namespace genesis {
48 namespace population {
49 
50 // =================================================================================================
51 // F_ST Conventional
52 // =================================================================================================
53 
61 std::tuple<double, double, double> f_st_conventional_pool_pi_snp(
62  BaseCounts const& p1, BaseCounts const& p2
63 );
64 
69 template<class ForwardIterator1, class ForwardIterator2>
70 double f_st_conventional_pool( // get_conventional_fstcalculator
71  size_t p1_poolsize, size_t p2_poolsize,
72  ForwardIterator1 p1_begin, ForwardIterator1 p1_end,
73  ForwardIterator2 p2_begin, ForwardIterator2 p2_end
74 ) {
75  // Edge and error cases
76  if( p1_poolsize <= 1 || p2_poolsize <= 1 ) {
77  throw std::invalid_argument( "Cannot run f_st_conventional_pool() with poolsizes <= 1" );
78  }
79 
80  // Theta Pi values for the two populations and their combination
81  double p1_pi_sum = 0.0;
82  double p2_pi_sum = 0.0;
83  double pp_pi_sum = 0.0;
84 
85  auto p1_it = p1_begin;
86  auto p2_it = p2_begin;
87  while( p1_it != p1_end && p2_it != p2_end ) {
88 
89  // Compute frequency based pi snps. The tuple returns p1, p2, pp, in that order.
90  auto const pi_snps = f_st_conventional_pool_pi_snp( *p1_it, *p2_it );
91 
92  // Skip invalid entries than can happen when less than two of [ACGT] have counts > 0
93  // in one of the BaseCounts samples.
94  if(
95  std::isfinite( std::get<0>( pi_snps )) &&
96  std::isfinite( std::get<1>( pi_snps )) &&
97  std::isfinite( std::get<2>( pi_snps ))
98  ) {
99  // If we are here, both p1 and p2 have counts. Let's assert.
100  assert( p1_it->a_count + p1_it->c_count + p1_it->g_count + p1_it->t_count > 0 );
101  assert( p2_it->a_count + p2_it->c_count + p2_it->g_count + p2_it->t_count > 0 );
102 
103  // Now add them to the tally.
104  p1_pi_sum += std::get<0>( pi_snps );
105  p2_pi_sum += std::get<1>( pi_snps );
106  pp_pi_sum += std::get<2>( pi_snps );
107  } else {
108  // If we are here, at least one of the two inputs has one or fewer counts in [ACGT],
109  // otherwise, the results would have been finite. Let's assert this.
110  assert(
111  ( p1_it->a_count + p1_it->c_count + p1_it->g_count + p1_it->t_count <= 1 ) ||
112  ( p2_it->a_count + p2_it->c_count + p2_it->g_count + p2_it->t_count <= 1 )
113  );
114  }
115 
116  // Next pair of entries
117  ++p1_it;
118  ++p2_it;
119  }
120  if( p1_it != p1_end || p2_it != p2_end ) {
121  throw std::invalid_argument(
122  "In f_st_conventional_pool(): Provided ranges have different length."
123  );
124  }
125 
126  // Normalize by poolsize
127  assert( p1_poolsize > 1 && p2_poolsize > 1 );
128  size_t const pp_poolsize = std::min( p1_poolsize, p2_poolsize );
129  p1_pi_sum *= static_cast<double>( p1_poolsize ) / static_cast<double>( p1_poolsize - 1 );
130  p2_pi_sum *= static_cast<double>( p2_poolsize ) / static_cast<double>( p2_poolsize - 1 );
131  pp_pi_sum *= static_cast<double>( pp_poolsize ) / static_cast<double>( pp_poolsize - 1 );
132 
133  // _calculateFstValues
134  double const pp_avg = ( p1_pi_sum + p2_pi_sum ) / 2.0;
135  return ( pp_pi_sum - pp_avg ) / pp_pi_sum;
136 }
137 
155 template<class ForwardIterator>
157  std::vector<size_t> const& poolsizes,
158  ForwardIterator begin, ForwardIterator end
159 ) {
160  auto result = utils::Matrix<double>( poolsizes.size(), poolsizes.size(), 0.0 );
161 
162  // We use a lambda that returns a tranforming rage to select an entry at a given index
163  // in the set of BaseCounts at the current iterator position.
164  auto const pss = poolsizes.size();
165  auto select_entry = [pss]( ForwardIterator begin, ForwardIterator end, size_t index ){
166  // Currently, in order to use Window here, we need to explicitly use std::vector<BaseCounts>
167  // instead of the more generic T... Bit unfortunate, but good enough for now.
168  // Will have to revisit later if we get to use cases where the BaseCounts are not stored
169  // in a vector, but some other container.
170  // using T = typename ForwardIterator::value_type;
172  [pss, index]( std::vector<BaseCounts> const& pools ) -> BaseCounts const& {
173  if( pools.size() != pss ) {
174  throw std::runtime_error(
175  "In f_st_conventional_pool(): Provided number of poolsizes (" +
176  std::to_string( pss ) + ") is not equal to the number of BaseCounts (" +
177  std::to_string( pools.size() ) + ") at some point in the iteration"
178  );
179  }
180  return pools[index];
181  },
182  begin, end
183  );
184  };
185 
186  // Loop over all pairs of entries, and compute f_st for each of these pairs.
187  // That is, in the inner code of the two loops, we run the f_st function that takes
188  // two ranges, providing it with a pair of indices for which we compute the value.
189  for( size_t i = 0; i < poolsizes.size(); ++i ) {
190  for( size_t j = i + 1; j < poolsizes.size(); ++j ) {
191  auto range_i = select_entry( begin, end, i );
192  auto range_j = select_entry( begin, end, j );
193  auto const fst = f_st_conventional_pool(
194  poolsizes[i], poolsizes[j],
195  range_i.begin(), range_i.end(),
196  range_j.begin(), range_j.end()
197  );
198  result( i, j ) = fst;
199  result( j, i ) = fst;
200  }
201  }
202 
203  return result;
204 }
205 
215 template<class ForwardIterator>
217  size_t number_of_samples, size_t poolsizes,
218  ForwardIterator begin, ForwardIterator end
219 ) {
220  // With no data, return empty result
221  if( begin == end ) {
222  return {};
223  }
224 
225  // With data: get the number of samples, and use that to fill a vector with identical pool sizes.
226  // Then, use the other version of this function to compute the result.
227  auto const ps = std::vector<size_t>( number_of_samples, poolsizes );
228  return f_st_conventional_pool( ps, begin, end );
229 }
230 
231 // =================================================================================================
232 // F_ST Asymptotically Unbiased (Karlsson)
233 // =================================================================================================
234 
239 struct FstAN
240 {
241  double a_1 = 0.0;
242  double n_1 = 0.0;
243  double a_2 = 0.0;
244  double n_2 = 0.0;
245 };
246 
250 inline bool operator ==( FstAN const& f1, FstAN const& f2 )
251 {
252  return f1.a_1 == f2.a_1 && f1.n_1 == f2.n_1 && f1.a_2 == f2.a_2 && f1.n_2 == f2.n_2;
253 }
254 
261 FstAN f_st_asymptotically_unbiased_a1n1a2n2( BaseCounts const& p1, BaseCounts const& p2 );
262 
269 std::pair<double, double> f_st_asymptotically_unbiased_nkdk( FstAN const& fstan );
270 
276 template<class ForwardIterator1, class ForwardIterator2>
277 double f_st_asymptotically_unbiased( // get_asymptunbiased_fstcalculator
278  ForwardIterator1 p1_begin, ForwardIterator1 p1_end,
279  ForwardIterator2 p2_begin, ForwardIterator2 p2_end
280 ) {
281  using namespace genesis::utils;
282 
283  // Result value.
284  double sum_nk = 0.0;
285  double sum_dk = 0.0;
286 
287  // Iterate both ranges, summing up N_k and D_k for all their entries.
288  auto p1_it = p1_begin;
289  auto p2_it = p2_begin;
290  while( p1_it != p1_end && p2_it != p2_end ) {
291 
292  // Get intermediate values and add them up.
293  auto const anan = f_st_asymptotically_unbiased_a1n1a2n2( *p1_it, *p2_it );
294  auto const nkdk = f_st_asymptotically_unbiased_nkdk( anan );
295  sum_nk += nkdk.first;
296  sum_dk += nkdk.second;
297 
298  // Next pair of entries
299  ++p1_it;
300  ++p2_it;
301  }
302  if( p1_it != p1_end || p2_it != p2_end ) {
303  throw std::invalid_argument(
304  "In f_st_asymptotically_unbiased(): Provided ranges have different length."
305  );
306  }
307 
308  return sum_nk / sum_dk;
309 }
310 
327 template<class ForwardIterator>
329  ForwardIterator begin, ForwardIterator end
330 ) {
331  // With no data, return empty result
332  if( begin == end ) {
333  return {};
334  }
335 
336  // Now we now that there are entries in the rage. Use the first one to get the number of
337  // pool samples in the ragen. We later check that this is the same for each entry in the range.
338  // Use that size to initialize the resulting matrix.
339  size_t const ps = static_cast<std::vector<BaseCounts> const&>( *begin ).size();
340  auto result = utils::Matrix<double>( ps, ps, 0.0 );
341 
342  // We use a lambda that returns a tranforming rage to select an entry at a given index
343  // in the set of BaseCounts at the current iterator position.
344  auto select_entry = [ps]( ForwardIterator begin, ForwardIterator end, size_t index ){
346  [ps, index]( std::vector<BaseCounts> const& pools ) -> BaseCounts const& {
347  if( pools.size() != ps ) {
348  throw std::runtime_error(
349  "In f_st_asymptotically_unbiased(): The number of BaseCounts in the "
350  "provided range is not consistent"
351  );
352  }
353  return pools[index];
354  },
355  begin, end
356  );
357  };
358 
359  // Loop over all pairs of entries, and compute f_st for each of these pairs.
360  // That is, in the inner code of the two loops, we run the f_st function that takes
361  // two ranges, providing it with a pair of indices for which we compute the value.
362  for( size_t i = 0; i < ps; ++i ) {
363  for( size_t j = i + 1; j < ps; ++j ) {
364  auto range_i = select_entry( begin, end, i );
365  auto range_j = select_entry( begin, end, j );
366  auto const fst = f_st_asymptotically_unbiased(
367  range_i.begin(), range_i.end(),
368  range_j.begin(), range_j.end()
369  );
370  result( i, j ) = fst;
371  result( j, i ) = fst;
372  }
373  }
374 
375  return result;
376 }
377 
378 } // namespace population
379 } // namespace genesis
380 
381 #endif // include guard
genesis::population::f_st_asymptotically_unbiased
double f_st_asymptotically_unbiased(ForwardIterator1 p1_begin, ForwardIterator1 p1_end, ForwardIterator2 p2_begin, ForwardIterator2 p2_end)
Compute the asymptotically unbiased F_ST estimator of Karlsson et al.
Definition: structure.hpp:277
genesis::population::FstAN
Helper struct for the a_1, a_2, n_1, and n_2 values needed for f_st_asymptotically_unbiased().
Definition: structure.hpp:239
variant.hpp
genesis::population::FstAN::n_1
double n_1
Definition: structure.hpp:242
genesis::population::f_st_conventional_pool_pi_snp
std::tuple< double, double, double > f_st_conventional_pool_pi_snp(BaseCounts const &p1, BaseCounts const &p2)
Compute the SNP-based Theta Pi values used in f_st_conventional_pool()
Definition: structure.cpp:47
genesis::population::FstAN::n_2
double n_2
Definition: structure.hpp:244
genesis::utils
Definition: placement/formats/edge_color.hpp:42
genesis::utils::Matrix< double >
genesis::population::FstAN::a_2
double a_2
Definition: structure.hpp:243
genesis::population::operator==
bool operator==(GenomeRegion const &a, GenomeRegion const &b)
Definition: functions/genome_region.cpp:70
matrix.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
transform_iterator.hpp
genesis::population::BaseCounts
One set of nucleotide base counts, for example for a given sample that represents a pool of sequenced...
Definition: base_counts.hpp:53
genesis::population::f_st_conventional_pool
double f_st_conventional_pool(size_t p1_poolsize, size_t p2_poolsize, ForwardIterator1 p1_begin, ForwardIterator1 p1_end, ForwardIterator2 p2_begin, ForwardIterator2 p2_end)
Compute the conventional F_ST statistic for pool-sequenced data, following Kofler et al,...
Definition: structure.hpp:70
genesis::utils::make_transform_range
Range< TransformIterator< TransformFunctor, BaseIterator > > make_transform_range(TransformFunctor unary_func, BaseIterator begin, BaseIterator end)
Construct a transforming range, given the transformation function as well as the underlying base iter...
Definition: transform_iterator.hpp:438
variant.hpp
genesis::population::FstAN::a_1
double a_1
Definition: structure.hpp:241
genesis::population::f_st_asymptotically_unbiased_nkdk
std::pair< double, double > f_st_asymptotically_unbiased_nkdk(FstAN const &fstan)
Compute the N_k and D_k values needed for the asymptotically unbiased F_ST estimator of Karlsson et a...
Definition: structure.cpp:199
genesis::population::f_st_asymptotically_unbiased_a1n1a2n2
FstAN f_st_asymptotically_unbiased_a1n1a2n2(BaseCounts const &p1, BaseCounts const &p2)
Compute the a and n values needed for the asymptotically unbiased F_ST estimator of Karlsson et al.
Definition: structure.cpp:102