A library for working with phylogenetic and population genetic data.
v0.27.0
afs_estimate.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2021 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
34 
35 #include <cassert>
36 #include <cmath>
37 #include <cstdint>
38 #include <numeric>
39 #include <stdexcept>
40 
41 namespace genesis {
42 namespace population {
43 
44 // =================================================================================================
45 // Allele Frequency Spectrum Estimate
46 // =================================================================================================
47 
49 {
50  // Prepare return value.
51  AfsPileupRecord result;
52 
53  // We expect the pileup file to contain exactly one sample. No mpileup here.
54  if( record.samples.size() != 1 ) {
55  throw std::runtime_error(
56  "Invalid pileup record line that does not contain exactly one sample."
57  );
58  }
59 
60  // Get the sample stored in the record and check it.
61  auto const& smp = record.samples[0];
62  if( smp.read_bases.size() != smp.phred_scores.size() ) {
63  throw std::runtime_error(
64  "Invalid pileup record line where number of read bases "
65  "differs from number of phred scores."
66  );
67  }
68 
69  // We might reserve too much here, if not all bases are either ancestral or derived.
70  // That should not matter all too much in terms of memory - but is better for speed,
71  // so let's do it this way.
72  auto const base_cnt = smp.read_bases.size();
73  result.alleles.reserve( base_cnt );
74  result.phred_scores.reserve( base_cnt );
75 
76  // Helper function to check that the ancestral/reference base is valid.
77  auto check_ancestral_base_char_ = []( char r ){
78  if( r != 'A' && r != 'C' && r != 'G' && r != 'T' && r != 'N' ) {
79  throw std::runtime_error(
80  "Invalid ancestral base in pileup record line that is not in [ACGTN]."
81  );
82  }
83  };
84 
85  // Get the derived base, depending on what our input is.
86  char derived_base;
87  if( smp.ancestral_base != '\0' ) {
88  check_ancestral_base_char_( smp.ancestral_base );
89  derived_base = smp.ancestral_base;
90  } else if( record.reference_base != '\0' ) {
91  check_ancestral_base_char_( record.reference_base );
92  derived_base = record.reference_base;
93 
94  // TODO: else folded: simply use the most common as anc, and second most common (if present)
95  // as derived allele. could use the base counts code here again, if we made AfsPileupRecord
96  // derived from BaseCounts.
97 
98  } else {
99  throw std::runtime_error(
100  "Cannot convert pileup record line for allele frequency estimation, "
101  "as it misses the ancestral allele information."
102  );
103  }
104 
105  // Convert bases to just bool values of whether they are ancestral or derived,
106  // and skip all others.
107  for( size_t i = 0; i < base_cnt; ++i ) {
108  if( smp.read_bases[i] == record.reference_base ) {
109  result.alleles.emplace_back( false );
110  result.phred_scores.emplace_back( smp.phred_scores[i] );
111  } else if( smp.read_bases[i] == derived_base ) {
112  result.alleles.emplace_back( true );
113  result.phred_scores.emplace_back( smp.phred_scores[i] );
114  }
115  }
116  assert( result.alleles.size() == result.phred_scores.size() );
117 
118  return result;
119 }
120 
121 std::vector<double> prob_cond_true_freq(
122  size_t n, // n: number of haplotypes in the pool
123  std::vector<bool> const& alleles, // vo: vector of alleles (1 if derived, 0 otherwise), size r
124  std::vector<unsigned char> const& phred_scores, // SE: vector of error probabilities, size r
125  bool unfolded // unfolded: 1 if unfolded, 0 else
126 ) {
127  // result p: conditional prob of vo given z,r,SE,unfolded. size n+1*1
128  if( unfolded ) {
129  return prob_cond_true_freq_unfolded( n, alleles, phred_scores );
130  } else {
131  // Compute for alleles and their inverse (ancestral and derived flipped), then average.
132  auto const prob1 = prob_cond_true_freq_unfolded( n, alleles, phred_scores, false );
133  auto const prob2 = prob_cond_true_freq_unfolded( n, alleles, phred_scores, true );
134  assert( prob1.size() == n + 1 );
135  assert( prob2.size() == n + 1 );
136 
137  auto result = std::vector<double>( n + 1 );
138  for( size_t i = 0; i < n + 1; ++i ) {
139  result[i] = ( prob1[i] + prob2[i] ) / 2.0;
140  }
141  return result;
142  }
143 }
144 
145 std::vector<double> prob_cond_true_freq_unfolded(
146  size_t n, // n: number of haplotypes in the pool
147  std::vector<bool> const& alleles, // vo: vector of alleles (1 if derived, 0 otherwise), size r
148  std::vector<unsigned char> const& phred_scores, // SE: vector of error probabilities, size r
149  bool invert_alleles
150 ) {
151  // Prepare result p: conditional prob of vo given z,r,SE,unfolded. size n+1*1
152  // auto result = std::vector<double>( n + 1, 0.0 );
153  auto result = std::vector<double>( n + 1, 1.0 );
154 
155  // r: number of reads at the position
156  // nd: n converted to double
157  auto const r = alleles.size();
158  auto const nd = static_cast<double>( n );
159  if( alleles.size() != phred_scores.size() ) {
160  throw std::invalid_argument(
161  "Invalid alleles and their error probabilities with different size."
162  );
163  }
164 
165  // Process all reads/alleles with their respective error probs
166  for( size_t i = 0; i < r; ++i ) {
167  // We only need the actual error probability here, so instead of allocating a vector for
168  // all of them, we just convert from phread to err prob here on the fly.
169  auto const err_prob = sequence::phred_score_to_error_probability( phred_scores[i] );
170  assert( 0.0 < err_prob && err_prob <= 1.0 );
171 
172  // Equations (1) and (2) from Boitard et al 2012, 10.1093/molbev/mss090,
173  // where we sum over all possible states for n chromosomes, with different
174  // sums for whether we have the derived or ancestral allele.
175  if( alleles[i] ^ invert_alleles ) {
176  for( size_t j = 0; j < n + 1; ++j ) {
177  auto const jn = static_cast<double>( j ) / nd;
178  auto const term1 = ( 1.0 - err_prob ) * jn;
179  auto const term2 = err_prob * ( 1.0 - jn );
180  // result[j] += std::log( term1 + term2 );
181  result[j] *= ( term1 + term2 );
182  }
183  } else {
184  for( size_t j = 0; j < n + 1; ++j ) {
185  auto const jn = static_cast<double>( j ) / nd;
186  auto const term1 = ( 1.0 - err_prob ) * ( 1.0 - jn );
187  auto const term2 = err_prob * jn;
188  // result[j] += std::log( term1 + term2 );
189  result[j] *= ( term1 + term2 );
190  }
191  }
192  }
193 
194  // for( size_t j = 0; j < n + 1; ++j ) {
195  // result[j] = std::exp( result[j] );
196  // }
197  return result;
198 }
199 
200 } // namespace population
201 } // namespace genesis
genesis::sequence::phred_score_to_error_probability
double phred_score_to_error_probability(unsigned char phred_score)
Definition: quality.cpp:518
genesis::population::AfsPileupRecord::phred_scores
std::vector< unsigned char > phred_scores
Definition: afs_estimate.hpp:67
genesis::population::AfsPileupRecord::alleles
std::vector< bool > alleles
Definition: afs_estimate.hpp:66
genesis::population::convert_to_afs_pileup_record
AfsPileupRecord convert_to_afs_pileup_record(SimplePileupReader::Record const &record)
Definition: afs_estimate.cpp:48
genesis::population::prob_cond_true_freq
std::vector< double > prob_cond_true_freq(size_t n, std::vector< bool > const &alleles, std::vector< unsigned char > const &phred_scores, bool unfolded)
Definition: afs_estimate.cpp:121
genesis::population::SimplePileupReader::Record
Single line/record from a pileup file.
Definition: simple_pileup_reader.hpp:151
genesis::population::SimplePileupReader::Record::reference_base
char reference_base
Definition: simple_pileup_reader.hpp:155
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::SimplePileupReader::Record::samples
std::vector< Sample > samples
Definition: simple_pileup_reader.hpp:156
afs_estimate.hpp
genesis::population::AfsPileupRecord
Helper to store the data of one pileup line/record needed for the Boitard et al Allele Frequency Esti...
Definition: afs_estimate.hpp:57
genesis::population::prob_cond_true_freq_unfolded
std::vector< double > prob_cond_true_freq_unfolded(size_t n, std::vector< bool > const &alleles, std::vector< unsigned char > const &phred_scores, bool invert_alleles)
Definition: afs_estimate.cpp:145
quality.hpp