A library for working with phylogenetic data.
v0.25.0
base_counts.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2021 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
34 
35 #include <cassert>
36 #include <iostream>
37 #include <stdexcept>
38 
39 namespace genesis {
40 namespace population {
41 
42 // =================================================================================================
43 // Status and Information
44 // =================================================================================================
45 
47  BaseCounts const& sample,
48  size_t min_coverage,
49  size_t max_coverage,
50  size_t min_count,
51  bool tolerate_deletions
52 ) {
53  BaseCountsStatus result;
54  auto const nucleotide_count = nucleotide_sum( sample );
55 
56  // Set the min/max coverage related values.
57  if(
58  nucleotide_count > 0 && nucleotide_count >= min_coverage &&
59  ( max_coverage == 0 || nucleotide_count <= max_coverage )
60  ) {
61  result.is_covered = true;
62 
63  // Sum up the number of different ACGT counts that are present, to determine whether
64  // this is a SNP, and whether it's biallelic. We use bool to int conversion for simplicity,
65  // and to avoid branching. For this, we use the fact that bool converts to 1/0 int.
66  // We have a special case here for min_count == 0, in which case we do
67  // not want to count a 0 as being "above" the min count. That would be riddiculous.
68  static_assert( static_cast<int>( true ) == 1, "Invalid bool(true) to int(1) conversion." );
69  static_assert( static_cast<int>( false ) == 0, "Invalid bool(false) to int(0) conversion." );
70  size_t al_count = 0;
71  al_count += static_cast<int>( sample.a_count > 0 && sample.a_count >= min_count );
72  al_count += static_cast<int>( sample.c_count > 0 && sample.c_count >= min_count );
73  al_count += static_cast<int>( sample.g_count > 0 && sample.g_count >= min_count );
74  al_count += static_cast<int>( sample.t_count > 0 && sample.t_count >= min_count );
75 
76  // Determine type of SNP.
77  if( al_count >= 2 ) {
78  result.is_snp = true;
79  }
80  if( al_count == 2 ) {
81  result.is_biallelic = true;
82  }
83 
84  // Check deletions. We have the same special case as above here.
85  if( sample.d_count > 0 && sample.d_count >= min_count && !tolerate_deletions ) {
86  result.is_covered = false;
87  result.is_snp = false;
88  result.is_biallelic = false;
89  result.is_ignored = true;
90  }
91  }
92 
93  return result;
94 }
95 
96 size_t get_base_count( BaseCounts const& bc, char base )
97 {
98  switch( base ) {
99  case 'a':
100  case 'A': {
101  return bc.a_count;
102  }
103  case 'c':
104  case 'C': {
105  return bc.c_count;
106  }
107  case 'g':
108  case 'G': {
109  return bc.g_count;
110  }
111  case 't':
112  case 'T': {
113  return bc.t_count;
114  }
115  case 'n':
116  case 'N': {
117  return bc.n_count;
118  }
119  case 'd':
120  case 'D':
121  case '*':
122  case '.':
123  case '#': {
124  return bc.d_count;
125  }
126  }
127  throw std::runtime_error(
128  "Invalid base character " + utils::char_to_hex( base )
129  );
130 }
131 
132 // =================================================================================================
133 // Accumulation, Filtering, etc
134 // =================================================================================================
135 
136 void merge_inplace( BaseCounts& p1, BaseCounts const& p2 )
137 {
138  // Make sure that we do not forget any fields in case of later refactoring of the struct.
139  // Nope, can't do, as the compiler might allocate more to get proper memory alignment...
140  // static_assert(
141  // sizeof( BaseCounts ) == 6 * sizeof( size_t ),
142  // "Unexpected number of member variables in BaseCounts class"
143  // );
144 
145  p1.a_count += p2.a_count;
146  p1.c_count += p2.c_count;
147  p1.g_count += p2.g_count;
148  p1.t_count += p2.t_count;
149  p1.n_count += p2.n_count;
150  p1.d_count += p2.d_count;
151 }
152 
153 BaseCounts merge( BaseCounts const& p1, BaseCounts const& p2 )
154 {
155  BaseCounts result = p1;
156  merge_inplace( result, p2 );
157  return result;
158 }
159 
160 BaseCounts merge( std::vector<BaseCounts> const& p )
161 {
162  BaseCounts result;
163  for( auto const& ps : p ) {
164  result.a_count += ps.a_count;
165  result.c_count += ps.c_count;
166  result.g_count += ps.g_count;
167  result.t_count += ps.t_count;
168  result.n_count += ps.n_count;
169  result.d_count += ps.d_count;
170  }
171  return result;
172 }
173 
174 void filter_min_count( BaseCounts& sample, size_t min_count )
175 {
176  // Reset counts if needed, according to min count setting.
177  if( sample.a_count < min_count ) {
178  sample.a_count = 0;
179  }
180  if( sample.c_count < min_count ) {
181  sample.c_count = 0;
182  }
183  if( sample.g_count < min_count ) {
184  sample.g_count = 0;
185  }
186  if( sample.t_count < min_count ) {
187  sample.t_count = 0;
188  }
189 }
190 
191 std::pair<char, double> consensus( BaseCounts const& sample )
192 {
193  // Get total count/coverage with nucleotides.
194  auto const nucleotide_count = nucleotide_sum( sample );
195 
196  // Only compute consensus if we have enough coverage. This can only be if we have
197  // at least some counts. Assert that.
198  if( nucleotide_count == 0 ) {
199  return { 'N', 0.0 };
200  }
201  assert( nucleotide_count > 0 );
202  assert( sample.a_count > 0 || sample.c_count > 0 || sample.g_count > 0 || sample.t_count > 0 );
203  assert(
204  sample.a_count + sample.c_count + sample.g_count + sample.t_count == nucleotide_count
205  );
206 
207  // Fast way of comparing four variables and finding the name of the max one.
208  // We don't want any expensive sorting on dynamic memory (vecors or the like),
209  // but just do pairwise comparisons with indices instead. Let's not even use a loop
210  // (loop unrolling), to be even faster. Four int copies and three comparisons. So smart.
211  size_t const counts[] = { sample.a_count, sample.c_count, sample.g_count, sample.t_count };
212  size_t max_idx = 0;
213  if( counts[1] > counts[max_idx] ) {
214  max_idx = 1;
215  }
216  if( counts[2] > counts[max_idx] ) {
217  max_idx = 2;
218  }
219  if( counts[3] > counts[max_idx] ) {
220  max_idx = 3;
221  }
222 
223  // Now use the index to get the consensus character from a static lookup, and the confidence.
224  static const char nts[] = {'A', 'C', 'G', 'T'};
225  auto const conf
226  = static_cast<double>( counts[max_idx] )
227  / static_cast<double>( nucleotide_count )
228  ;
229  return { nts[max_idx], conf };
230 }
231 
232 std::pair<char, double> consensus( BaseCounts const& sample, BaseCountsStatus const& status )
233 {
234  if( status.is_covered ) {
235  return consensus( sample );
236  } else {
237  return { 'N', 0.0 };
238  }
239 }
240 
241 // =================================================================================================
242 // Conversion Functions
243 // =================================================================================================
244 
246  SimplePileupReader::Sample const& sample,
247  unsigned char min_phred_score
248 ) {
249  BaseCounts result;
250 
251  // Tally up the bases.
252  size_t total_count = 0;
253  size_t skip_count = 0;
254  size_t rna_count = 0;
255  for( size_t i = 0; i < sample.read_bases.size(); ++i ) {
256 
257  // Quality control if available. Skip bases that are below the threshold.
258  if( !sample.phred_scores.empty() && sample.phred_scores[i] < min_phred_score ) {
259  ++skip_count;
260  continue;
261  }
262 
263  ++total_count;
264  switch( sample.read_bases[i] ) {
265  case 'a':
266  case 'A': {
267  ++result.a_count;
268  break;
269  }
270  case 'c':
271  case 'C': {
272  ++result.c_count;
273  break;
274  }
275  case 'g':
276  case 'G': {
277  ++result.g_count;
278  break;
279  }
280  case 't':
281  case 'T': {
282  ++result.t_count;
283  break;
284  }
285  case 'n':
286  case 'N': {
287  ++result.n_count;
288  break;
289  }
290  case '*':
291  case '#': {
292  ++result.d_count;
293  break;
294  }
295  case '<':
296  case '>': {
297  // Skipping RNA symbols. But count them, for sanity check.
298  (void) rna_count;
299  ++rna_count;
300  break;
301  }
302  default: {
303  throw std::runtime_error(
304  "Malformed pileup sample: Invalid allele character " +
305  utils::char_to_hex( sample.read_bases[i] )
306  );
307  }
308  }
309  }
310 
311  // Sanity checks and assertions.
312  (void) total_count;
313  assert(
314  total_count ==
315  result.a_count + result.c_count + result.g_count + result.t_count +
316  result.n_count + result.d_count + rna_count
317  );
318  assert( skip_count + total_count == sample.read_bases.size() );
319 
320  // Sum sanity checks. There seems to be a very weird special case (found in the PoPoolation2
321  // test dataset) where a line contains a deletion with a low phred score (`*`) that is not
322  // counted in the "Number of reads covering this position" counter:
323  // ` 89795 2R 113608 N 1 T$ A 0 * *`
324  // We account for this here by allowing exactly one such base that is either a deletion
325  // or a skip due to low phred score. There is no information that we know of about how
326  // "empty" lines should be treated in pileip, so we have to guess, and that here seems to work.
327  auto const base_count =
328  result.a_count + result.c_count + result.g_count + result.t_count + result.n_count
329  ;
330  if(
331  sample.read_bases.size() != sample.read_coverage &&
332  !( base_count == 0 && result.d_count + skip_count == 1 )
333  ) {
334 
335  throw std::runtime_error(
336  "Malformed pileup sample: Given read count (" + std::to_string( sample.read_coverage ) +
337  ") does not match the number of bases found in the sample (" +
338  std::to_string( sample.read_bases.size() ) + ")"
339  );
340  }
341 
342  return result;
343 }
344 
345 std::ostream& operator<<( std::ostream& os, BaseCounts const& bs )
346 {
347  os << "A=" << bs.a_count << ", C=" << bs.c_count << ", G=" << bs.g_count;
348  os << ", T=" << bs.t_count << ", N=" << bs.n_count << ", D=" << bs.d_count;
349  return os;
350 }
351 
352 std::ostream& to_sync( BaseCounts const& bs, std::ostream& os )
353 {
354  os << bs.a_count << ":" << bs.t_count << ":" << bs.c_count << ":" << bs.g_count;
355  os << ":" << bs.n_count << ":" << bs.d_count;
356  return os;
357 }
358 
359 } // namespace population
360 } // namespace genesis
genesis::population::BaseCountsStatus
Definition: functions/base_counts.hpp:49
genesis::population::BaseCounts::d_count
size_t d_count
Count of all deleted (*) nucleotides that are present in the sample.
Definition: base_counts.hpp:83
genesis::population::get_base_count
size_t get_base_count(BaseCounts const &bc, char base)
Get the count for a base given as a char.
Definition: base_counts.cpp:96
genesis::population::BaseCounts::t_count
size_t t_count
Count of all T nucleotides that are present in the sample.
Definition: base_counts.hpp:73
genesis::population::filter_min_count
void filter_min_count(BaseCounts &sample, size_t min_count)
Filter by minimum count that we need for a type of nucleotide (A, C, G, T) to be considered; set to z...
Definition: base_counts.cpp:174
genesis::population::BaseCountsStatus::is_biallelic
bool is_biallelic
Is the Sample biallelic?
Definition: functions/base_counts.hpp:88
genesis::population::BaseCounts::g_count
size_t g_count
Count of all G nucleotides that are present in the sample.
Definition: base_counts.hpp:68
genesis::population::BaseCounts::a_count
size_t a_count
Count of all A nucleotides that are present in the sample.
Definition: base_counts.hpp:58
genesis::population::SimplePileupReader::Sample::read_coverage
size_t read_coverage
Total count of reads covering this position.
Definition: simple_pileup_reader.hpp:106
genesis::population::operator<<
std::ostream & operator<<(std::ostream &os, BaseCounts const &bs)
Output stream operator for BaseCounts instances.
Definition: base_counts.cpp:345
genesis::population::consensus
std::pair< char, double > consensus(BaseCounts const &sample)
Consensus character for a BaseCounts, and its confidence.
Definition: base_counts.cpp:191
genesis::population::BaseCounts::n_count
size_t n_count
Count of all N (undetermined/any) nucleotides that are present in the sample.
Definition: base_counts.hpp:78
genesis::population::SimplePileupReader::Sample
One sample in a pileup line/record.
Definition: simple_pileup_reader.hpp:92
genesis::population::BaseCountsStatus::is_snp
bool is_snp
Does the Sample have two or more alleles?
Definition: functions/base_counts.hpp:76
genesis::population::nucleotide_sum
size_t nucleotide_sum(BaseCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: functions/base_counts.hpp:177
genesis::population::convert_to_base_counts
BaseCounts convert_to_base_counts(SimplePileupReader::Sample const &sample, unsigned char min_phred_score)
Definition: base_counts.cpp:245
genesis::population::merge
BaseCounts merge(BaseCounts const &p1, BaseCounts const &p2)
Merge the counts of two BaseCountss.
Definition: base_counts.cpp:153
genesis::population::BaseCounts::c_count
size_t c_count
Count of all C nucleotides that are present in the sample.
Definition: base_counts.hpp:63
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
base_counts.hpp
genesis::population::SimplePileupReader::Sample::read_bases
std::string read_bases
All bases (expect for indels) of the reads that cover the given position.
Definition: simple_pileup_reader.hpp:115
genesis::population::status
BaseCountsStatus status(BaseCounts const &sample, size_t min_coverage, size_t max_coverage, size_t min_count, bool tolerate_deletions)
Compute a simple status with useful properties from the counts of a BaseCounts.
Definition: base_counts.cpp:46
genesis::population::BaseCounts
One set of nucleotide base counts, for example for a given sample that represents a pool of sequenced...
Definition: base_counts.hpp:53
char.hpp
genesis::population::to_sync
std::ostream & to_sync(BaseCounts const &bs, std::ostream &os)
Output a BaseCounts instance to a stream in the PoPoolation2 sync format.
Definition: base_counts.cpp:352
genesis::population::to_string
std::string to_string(GenomeRegion const &region)
Definition: functions/genome_region.cpp:55
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:67
genesis::population::merge_inplace
void merge_inplace(BaseCounts &p1, BaseCounts const &p2)
Merge the counts of two BaseCountss, by adding the counts of the second (p2) to the first (p1).
Definition: base_counts.cpp:136
genesis::population::SimplePileupReader::Sample::phred_scores
std::vector< unsigned char > phred_scores
Phread-scaled scores of the bases as given in read_bases.
Definition: simple_pileup_reader.hpp:124
genesis::population::BaseCountsStatus::is_ignored
bool is_ignored
Is the Sample ignored due to high deletions count?
Definition: functions/base_counts.hpp:100
genesis::population::BaseCountsStatus::is_covered
bool is_covered
Is the Sample covered by enough reads/nucleotides?
Definition: functions/base_counts.hpp:64