A library for working with phylogenetic and population genetic data.
v0.27.0
population/functions/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
34 
35 #include <array>
36 #include <cassert>
37 #include <cstring>
38 #include <iostream>
39 #include <stdexcept>
40 #include <utility>
41 
42 namespace genesis {
43 namespace population {
44 
45 // =================================================================================================
46 // Status and Information
47 // =================================================================================================
48 
50  BaseCounts const& sample,
51  size_t min_coverage,
52  size_t max_coverage,
53  size_t min_count,
54  bool tolerate_deletions
55 ) {
56  BaseCountsStatus result;
57  auto const nucleotide_count = nucleotide_sum( sample );
58 
59  // Set the min/max coverage related values.
60  if(
61  nucleotide_count > 0 && nucleotide_count >= min_coverage &&
62  ( max_coverage == 0 || nucleotide_count <= max_coverage )
63  ) {
64  result.is_covered = true;
65 
66  // Sum up the number of different ACGT counts that are present, to determine whether
67  // this is a SNP, and whether it's biallelic. We use bool to int conversion for simplicity,
68  // and to avoid branching. For this, we use the fact that bool converts to 1/0 int.
69  // We have a special case here for min_count == 0, in which case we do
70  // not want to count a 0 as being "above" the min count. That would be riddiculous.
71  static_assert( static_cast<int>( true ) == 1, "Invalid bool(true) to int(1) conversion." );
72  static_assert( static_cast<int>( false ) == 0, "Invalid bool(false) to int(0) conversion." );
73  size_t al_count = 0;
74  al_count += static_cast<int>( sample.a_count > 0 && sample.a_count >= min_count );
75  al_count += static_cast<int>( sample.c_count > 0 && sample.c_count >= min_count );
76  al_count += static_cast<int>( sample.g_count > 0 && sample.g_count >= min_count );
77  al_count += static_cast<int>( sample.t_count > 0 && sample.t_count >= min_count );
78 
79  // Determine type of SNP.
80  if( al_count >= 2 ) {
81  result.is_snp = true;
82  }
83  if( al_count == 2 ) {
84  result.is_biallelic = true;
85  }
86 
87  // Check deletions. We have the same special case as above here.
88  if( sample.d_count > 0 && sample.d_count >= min_count && !tolerate_deletions ) {
89  result.is_covered = false;
90  result.is_snp = false;
91  result.is_biallelic = false;
92  result.is_ignored = true;
93  }
94  }
95 
96  return result;
97 }
98 
99 // =================================================================================================
100 // Counts
101 // =================================================================================================
102 
103 size_t get_base_count( BaseCounts const& bc, char base )
104 {
105  switch( base ) {
106  case 'a':
107  case 'A': {
108  return bc.a_count;
109  }
110  case 'c':
111  case 'C': {
112  return bc.c_count;
113  }
114  case 'g':
115  case 'G': {
116  return bc.g_count;
117  }
118  case 't':
119  case 'T': {
120  return bc.t_count;
121  }
122  case 'n':
123  case 'N': {
124  return bc.n_count;
125  }
126  case 'd':
127  case 'D':
128  case '*':
129  case '.':
130  case '#': {
131  return bc.d_count;
132  }
133  }
134  throw std::runtime_error(
135  "Invalid base character " + utils::char_to_hex( base )
136  );
137 }
138 
140 {
141  return merge( variant.samples );
142 }
143 
144 // =================================================================================================
145 // Sorting
146 // =================================================================================================
147 
161 template<typename T>
162 std::array<size_t, 4> nucleotide_sorting_order_( std::array<T, 4> const& values )
163 {
164  // Sort quickly via sorting network, putting large values first.
165  // See https://stackoverflow.com/a/25070688/4184258
166  auto indices = std::array<size_t, 4>{ 0, 1, 2, 3 };
167  if( values[indices[0]] < values[indices[1]] ) {
168  std::swap( indices[0], indices[1] );
169  }
170  if( values[indices[2]] < values[indices[3]] ) {
171  std::swap( indices[2], indices[3] );
172  }
173  if( values[indices[0]] < values[indices[2]] ) {
174  std::swap( indices[0], indices[2] );
175  }
176  if( values[indices[1]] < values[indices[3]] ) {
177  std::swap( indices[1], indices[3] );
178  }
179  if( values[indices[1]] < values[indices[2]] ) {
180  std::swap( indices[1], indices[2] );
181  }
182 
183  // Now they are sorted, largest ones first.
184  assert( values[indices[0]] >= values[indices[1]] );
185  assert( values[indices[1]] >= values[indices[2]] );
186  assert( values[indices[2]] >= values[indices[3]] );
187 
188  return indices;
189 }
190 
192 {
193  // Sort quickly via sorting network, putting large values first.
194  // See https://stackoverflow.com/a/25070688/4184258
195  // This is the same as above in nucleotide_sorting_order_(), but we here swap directly,
196  // for speed, as a tradeoff against code duplication...
197  SortedBaseCounts result = {
198  'A', sample.a_count,
199  'C', sample.c_count,
200  'G', sample.g_count,
201  'T', sample.t_count
202  };
203  if( result[0].count < result[1].count ) {
204  std::swap( result[0], result[1] );
205  }
206  if( result[2].count < result[3].count ) {
207  std::swap( result[2], result[3] );
208  }
209  if( result[0].count < result[2].count ) {
210  std::swap( result[0], result[2] );
211  }
212  if( result[1].count < result[3].count ) {
213  std::swap( result[1], result[3] );
214  }
215  if( result[1].count < result[2].count ) {
216  std::swap( result[1], result[2] );
217  }
218  return result;
219 }
220 
221 std::pair<SortedBaseCounts, SortedBaseCounts> sorted_average_base_counts(
222  BaseCounts const& sample_a,
223  BaseCounts const& sample_b
224 ) {
225  // Prepare result.
226  auto result = std::pair<SortedBaseCounts, SortedBaseCounts>{};
227 
228  // Get frequencies in sample 1
229  size_t const s1_cnts[] = {
230  sample_a.a_count, sample_a.c_count, sample_a.g_count, sample_a.t_count
231  };
232  double const s1_nt_cnt = static_cast<double>( nucleotide_sum( sample_a )); // eucov
233  double const s1_freqs[] = {
234  static_cast<double>( sample_a.a_count ) / s1_nt_cnt,
235  static_cast<double>( sample_a.c_count ) / s1_nt_cnt,
236  static_cast<double>( sample_a.g_count ) / s1_nt_cnt,
237  static_cast<double>( sample_a.t_count ) / s1_nt_cnt
238  };
239 
240  // Get frequencies in sample 2
241  size_t const s2_cnts[] = {
242  sample_b.a_count, sample_b.c_count, sample_b.g_count, sample_b.t_count
243  };
244  double const s2_nt_cnt = static_cast<double>( nucleotide_sum( sample_b )); // eucov
245  double const s2_freqs[] = {
246  static_cast<double>( sample_b.a_count ) / s2_nt_cnt,
247  static_cast<double>( sample_b.c_count ) / s2_nt_cnt,
248  static_cast<double>( sample_b.g_count ) / s2_nt_cnt,
249  static_cast<double>( sample_b.t_count ) / s2_nt_cnt
250  };
251 
252  // Edge case. If there are no counts at all, we return empty.
253  // The follow up function f_st_asymptotically_unbiased_nkdk() will also catch this edge case,
254  // return zeros as well, and nothing will be added to the total F_ST sum.
255  if( s1_nt_cnt == 0.0 || s2_nt_cnt == 0.0 ) {
256  return result;
257  }
258 
259  // Compute their averages.
260  std::array<double, 4> const avg_freqs = {
261  ( s1_freqs[0] + s2_freqs[0] ) / 2.0,
262  ( s1_freqs[1] + s2_freqs[1] ) / 2.0,
263  ( s1_freqs[2] + s2_freqs[2] ) / 2.0,
264  ( s1_freqs[3] + s2_freqs[3] ) / 2.0
265  };
266 
267  // Get the sorting order, based on the averages.
268  auto const order = nucleotide_sorting_order_( avg_freqs );
269 
270  // Now they are sorted, largest ones first.
271  assert( avg_freqs[order[0]] >= avg_freqs[order[1]] );
272  assert( avg_freqs[order[1]] >= avg_freqs[order[2]] );
273  assert( avg_freqs[order[2]] >= avg_freqs[order[3]] );
274 
275  // Prepare result. We use an array of the nucleotides to get them in the order as needed.
276  static const char nts[] = {'A', 'C', 'G', 'T'};
277  result.first[0] = { nts[order[0]], s1_cnts[order[0]] };
278  result.first[1] = { nts[order[1]], s1_cnts[order[1]] };
279  result.first[2] = { nts[order[2]], s1_cnts[order[2]] };
280  result.first[3] = { nts[order[3]], s1_cnts[order[3]] };
281  result.second[0] = { nts[order[0]], s2_cnts[order[0]] };
282  result.second[1] = { nts[order[1]], s2_cnts[order[1]] };
283  result.second[2] = { nts[order[2]], s2_cnts[order[2]] };
284  result.second[3] = { nts[order[3]], s2_cnts[order[3]] };
285  return result;
286 }
287 
289  Variant const& variant, bool reference_first
290 ) {
291  // We use sorting networks for speed here. See f_st_asymptotically_unbiased_a1n1a2n2()
292  // for details on the technique.
293 
294  auto const total = total_base_counts( variant );
295  if( reference_first ) {
296  SortedBaseCounts result;
297  switch( variant.reference_base ) {
298  case 'a':
299  case 'A': {
300  result[0] = { 'A', total.a_count };
301  result[1] = { 'C', total.c_count };
302  result[2] = { 'G', total.g_count };
303  result[3] = { 'T', total.t_count };
304  break;
305  }
306  case 'c':
307  case 'C': {
308  result[0] = { 'C', total.c_count };
309  result[1] = { 'A', total.a_count };
310  result[2] = { 'G', total.g_count };
311  result[3] = { 'T', total.t_count };
312  break;
313  }
314  case 'g':
315  case 'G': {
316  result[0] = { 'G', total.g_count };
317  result[1] = { 'A', total.a_count };
318  result[2] = { 'C', total.c_count };
319  result[3] = { 'T', total.t_count };
320  break;
321  }
322  case 't':
323  case 'T': {
324  result[0] = { 'T', total.t_count };
325  result[1] = { 'A', total.a_count };
326  result[2] = { 'C', total.c_count };
327  result[3] = { 'G', total.g_count };
328  break;
329  }
330  default: {
331  throw std::runtime_error(
332  "Invalid reference base character " + utils::char_to_hex( variant.reference_base )
333  );
334  }
335  }
336  if( result[1].count < result[2].count ) {
337  std::swap( result[1], result[2] );
338  }
339  if( result[1].count < result[3].count ) {
340  std::swap( result[1], result[3] );
341  }
342  if( result[2].count < result[3].count ) {
343  std::swap( result[2], result[3] );
344  }
345  return result;
346  } else {
347  return sorted_base_counts( total );
348  }
349 }
350 
351 // =================================================================================================
352 // Merging
353 // =================================================================================================
354 
355 void merge_inplace( BaseCounts& p1, BaseCounts const& p2 )
356 {
357  // Make sure that we do not forget any fields in case of later refactoring of the struct.
358  // Nope, can't do, as the compiler might allocate more to get proper memory alignment...
359  // static_assert(
360  // sizeof( BaseCounts ) == 6 * sizeof( size_t ),
361  // "Unexpected number of member variables in BaseCounts class"
362  // );
363 
364  p1.a_count += p2.a_count;
365  p1.c_count += p2.c_count;
366  p1.g_count += p2.g_count;
367  p1.t_count += p2.t_count;
368  p1.n_count += p2.n_count;
369  p1.d_count += p2.d_count;
370 }
371 
372 BaseCounts merge( BaseCounts const& p1, BaseCounts const& p2 )
373 {
374  BaseCounts result = p1;
375  merge_inplace( result, p2 );
376  return result;
377 }
378 
379 BaseCounts merge( std::vector<BaseCounts> const& p )
380 {
381  BaseCounts result;
382  for( auto const& ps : p ) {
383  result.a_count += ps.a_count;
384  result.c_count += ps.c_count;
385  result.g_count += ps.g_count;
386  result.t_count += ps.t_count;
387  result.n_count += ps.n_count;
388  result.d_count += ps.d_count;
389  }
390  return result;
391 }
392 
393 // =================================================================================================
394 // Consensus
395 // =================================================================================================
396 
397 std::pair<char, double> consensus( BaseCounts const& sample )
398 {
399  // Get total count/coverage with nucleotides.
400  auto const nucleotide_count = nucleotide_sum( sample );
401 
402  // Only compute consensus if we have enough coverage. This can only be if we have
403  // at least some counts. Assert that.
404  if( nucleotide_count == 0 ) {
405  return { 'N', 0.0 };
406  }
407  assert( nucleotide_count > 0 );
408  assert( sample.a_count > 0 || sample.c_count > 0 || sample.g_count > 0 || sample.t_count > 0 );
409  assert(
410  sample.a_count + sample.c_count + sample.g_count + sample.t_count == nucleotide_count
411  );
412 
413  // Fast way of comparing four variables and finding the name of the max one.
414  // We don't want any expensive sorting on dynamic memory (vecors or the like),
415  // but just do pairwise comparisons with indices instead. Let's not even use a loop
416  // (loop unrolling), to be even faster. Four int copies and three comparisons. So smart.
417  size_t const counts[] = { sample.a_count, sample.c_count, sample.g_count, sample.t_count };
418  size_t max_idx = 0;
419  if( counts[1] > counts[max_idx] ) {
420  max_idx = 1;
421  }
422  if( counts[2] > counts[max_idx] ) {
423  max_idx = 2;
424  }
425  if( counts[3] > counts[max_idx] ) {
426  max_idx = 3;
427  }
428 
429  // Now use the index to get the consensus character from a static lookup, and the confidence.
430  static const char nts[] = {'A', 'C', 'G', 'T'};
431  auto const conf
432  = static_cast<double>( counts[max_idx] )
433  / static_cast<double>( nucleotide_count )
434  ;
435  return { nts[max_idx], conf };
436 }
437 
438 std::pair<char, double> consensus( BaseCounts const& sample, BaseCountsStatus const& status )
439 {
440  if( status.is_covered ) {
441  return consensus( sample );
442  } else {
443  return { 'N', 0.0 };
444  }
445 }
446 
447 char guess_reference_base( Variant const& variant )
448 {
449  auto const ref = utils::to_upper( variant.reference_base );
450  if( ref == 'A' || ref == 'C' || ref == 'G' || ref == 'T' ) {
451  return ref;
452  } else {
453  auto const sorted = sorted_base_counts( variant, false );
454  if( sorted[0].count > 0 ) {
455  return utils::to_upper( sorted[0].base );
456  }
457  }
458 
459  // Last else case outside, so that compilers always see that this function returns a value.
460  return 'N';
461 }
462 
463 char guess_alternative_base( Variant const& variant, bool force )
464 {
465  auto const alt = utils::to_upper( variant.alternative_base );
466  if( ! force && ( alt == 'A' || alt == 'C' || alt == 'G' || alt == 'T' )) {
467  return alt;
468  } else {
469  auto const ref = utils::to_upper( variant.reference_base );
470  if( ref == 'A' || ref == 'C' || ref == 'G' || ref == 'T' ) {
471  auto const sorted = sorted_base_counts( variant, true );
472  if( sorted[1].count > 0 ) {
473  return utils::to_upper( sorted[1].base );
474  }
475  }
476  }
477 
478  // Else case outside, so that compilers always see that this function returns a value.
479  return 'N';
480 }
481 
482 // =================================================================================================
483 // Output
484 // =================================================================================================
485 
486 std::ostream& operator<<( std::ostream& os, BaseCounts const& bs )
487 {
488  os << "A=" << bs.a_count << ", C=" << bs.c_count << ", G=" << bs.g_count;
489  os << ", T=" << bs.t_count << ", N=" << bs.n_count << ", D=" << bs.d_count;
490  return os;
491 }
492 
493 } // namespace population
494 } // namespace genesis
genesis::placement::swap
void swap(Sample &lhs, Sample &rhs)
Definition: sample.cpp:104
genesis::population::guess_reference_base
char guess_reference_base(Variant const &variant)
Guess the reference base of a Variant.
Definition: population/functions/functions.cpp:447
genesis::population::BaseCountsStatus
Definition: population/functions/functions.hpp:50
genesis::population::BaseCounts::d_count
size_t d_count
Count of all deleted (*) nucleotides that are present in the sample.
Definition: base_counts.hpp:84
genesis::population::get_base_count
size_t get_base_count(BaseCounts const &bc, char base)
Get the count for a base given as a char.
Definition: population/functions/functions.cpp:103
genesis::population::SortedBaseCounts
Ordered array of base counts for the four nucleotides.
Definition: base_counts.hpp:110
genesis::population::BaseCounts::t_count
size_t t_count
Count of all T nucleotides that are present in the sample.
Definition: base_counts.hpp:74
genesis::population::BaseCountsStatus::is_biallelic
bool is_biallelic
Is the Sample biallelic?
Definition: population/functions/functions.hpp:89
genesis::population::BaseCounts::g_count
size_t g_count
Count of all G nucleotides that are present in the sample.
Definition: base_counts.hpp:69
genesis::population::BaseCounts::a_count
size_t a_count
Count of all A nucleotides that are present in the sample.
Definition: base_counts.hpp:59
genesis::population::Variant::reference_base
char reference_base
Definition: variant.hpp:66
genesis::population::sorted_average_base_counts
std::pair< SortedBaseCounts, SortedBaseCounts > sorted_average_base_counts(BaseCounts const &sample_a, BaseCounts const &sample_b)
Return the sorted base counts of both input samples, orderd by the average frequencies of the nucleot...
Definition: population/functions/functions.cpp:221
genesis::population::operator<<
std::ostream & operator<<(std::ostream &os, BaseCounts const &bs)
Output stream operator for BaseCounts instances.
Definition: population/functions/functions.cpp:486
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
genesis::population::consensus
std::pair< char, double > consensus(BaseCounts const &sample)
Consensus character for a BaseCounts, and its confidence.
Definition: population/functions/functions.cpp:397
genesis::population::BaseCounts::n_count
size_t n_count
Count of all N (undetermined/any) nucleotides that are present in the sample.
Definition: base_counts.hpp:79
genesis::population::BaseCountsStatus::is_snp
bool is_snp
Does the Sample have two or more alleles?
Definition: population/functions/functions.hpp:77
genesis::population::nucleotide_sum
size_t nucleotide_sum(BaseCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: population/functions/functions.hpp:222
genesis::population::Variant::samples
std::vector< BaseCounts > samples
Definition: variant.hpp:69
genesis::population::merge
BaseCounts merge(BaseCounts const &p1, BaseCounts const &p2)
Merge the counts of two BaseCountss.
Definition: population/functions/functions.cpp:372
genesis::population::Variant
A single variant at a position in a chromosome, along with BaseCounts for a set of samples.
Definition: variant.hpp:62
genesis::population::BaseCounts::c_count
size_t c_count
Count of all C nucleotides that are present in the sample.
Definition: base_counts.hpp:64
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::guess_alternative_base
char guess_alternative_base(Variant const &variant, bool force)
Guess the alternative base of a Variant.
Definition: population/functions/functions.cpp:463
genesis::population::status
BaseCountsStatus status(BaseCounts const &sample, size_t min_coverage, size_t max_coverage, size_t min_count, bool tolerate_deletions)
Compute a simple status with useful properties from the counts of a BaseCounts.
Definition: population/functions/functions.cpp:49
genesis::population::BaseCounts
One set of nucleotide base counts, for example for a given sample that represents a pool of sequenced...
Definition: base_counts.hpp:54
genesis::population::total_base_counts
BaseCounts total_base_counts(Variant const &variant)
Get the summed up total base counts of a Variant.
Definition: population/functions/functions.cpp:139
char.hpp
genesis::population::sorted_base_counts
SortedBaseCounts sorted_base_counts(BaseCounts const &sample)
Return the order of base counts (nucleotides), largest one first.
Definition: population/functions/functions.cpp:191
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
genesis::population::Variant::alternative_base
char alternative_base
Definition: variant.hpp:67
genesis::population::nucleotide_sorting_order_
std::array< size_t, 4 > nucleotide_sorting_order_(std::array< T, 4 > const &values)
Local helper function that runs a sorting network to sort four values, coming from the four nucleotid...
Definition: population/functions/functions.cpp:162
genesis::population::merge_inplace
void merge_inplace(BaseCounts &p1, BaseCounts const &p2)
Merge the counts of two BaseCountss, by adding the counts of the second (p2) to the first (p1).
Definition: population/functions/functions.cpp:355
functions.hpp
genesis::population::BaseCountsStatus::is_ignored
bool is_ignored
Is the Sample ignored due to high deletions count?
Definition: population/functions/functions.hpp:101
genesis::population::BaseCountsStatus::is_covered
bool is_covered
Is the Sample covered by enough reads/nucleotides?
Definition: population/functions/functions.hpp:65