A library for working with phylogenetic and population genetic data.
v0.32.0
population/function/functions.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
35 
36 #include <array>
37 #include <cassert>
38 #include <cstring>
39 #include <iostream>
40 #include <stdexcept>
41 #include <utility>
42 
43 namespace genesis {
44 namespace population {
45 
46 // =================================================================================================
47 // Counts
48 // =================================================================================================
49 
51 {
52  switch( base ) {
53  case 'a':
54  case 'A': {
55  return sample.a_count;
56  }
57  case 'c':
58  case 'C': {
59  return sample.c_count;
60  }
61  case 'g':
62  case 'G': {
63  return sample.g_count;
64  }
65  case 't':
66  case 'T': {
67  return sample.t_count;
68  }
69  case 'n':
70  case 'N': {
71  return sample.n_count;
72  }
73  case 'd':
74  case 'D':
75  case '*':
76  case '.':
77  case '#': {
78  return sample.d_count;
79  }
80  }
81  throw std::runtime_error(
82  "Invalid base character " + utils::char_to_hex( base )
83  );
84 }
85 
86 void set_base_count( SampleCounts& sample, char base, SampleCounts::size_type value )
87 {
88  switch( base ) {
89  case 'a':
90  case 'A': {
91  sample.a_count = value;
92  break;
93  }
94  case 'c':
95  case 'C': {
96  sample.c_count = value;
97  break;
98  }
99  case 'g':
100  case 'G': {
101  sample.g_count = value;
102  break;
103  }
104  case 't':
105  case 'T': {
106  sample.t_count = value;
107  break;
108  }
109  case 'n':
110  case 'N': {
111  sample.n_count = value;
112  break;
113  }
114  case 'd':
115  case 'D':
116  case '*':
117  case '.':
118  case '#': {
119  sample.d_count = value;
120  break;
121  }
122  default: {
123  throw std::runtime_error(
124  "Invalid base character " + utils::char_to_hex( base )
125  );
126  }
127  }
128 }
129 
130 // =================================================================================================
131 // Sorting
132 // =================================================================================================
133 
135 {
136  // Sort quickly via sorting network, putting large values first.
137  // See https://stackoverflow.com/a/25070688/4184258
138  // This is the same as in nucleotide_sorting_order(), but we here swap directly,
139  // for speed, as a tradeoff against code duplication...
140  SortedSampleCounts result = {
141  'A', sample.a_count,
142  'C', sample.c_count,
143  'G', sample.g_count,
144  'T', sample.t_count
145  };
146  if( result[0].count < result[1].count ) {
147  std::swap( result[0], result[1] );
148  }
149  if( result[2].count < result[3].count ) {
150  std::swap( result[2], result[3] );
151  }
152  if( result[0].count < result[2].count ) {
153  std::swap( result[0], result[2] );
154  }
155  if( result[1].count < result[3].count ) {
156  std::swap( result[1], result[3] );
157  }
158  if( result[1].count < result[2].count ) {
159  std::swap( result[1], result[2] );
160  }
161  return result;
162 }
163 
164 std::pair<SortedSampleCounts, SortedSampleCounts> sorted_average_sample_counts(
165  SampleCounts const& sample_a,
166  SampleCounts const& sample_b
167 ) {
168  // Prepare result.
169  auto result = std::pair<SortedSampleCounts, SortedSampleCounts>{};
170 
171  // Get frequencies in sample 1
172  size_t const s1_cnts[] = {
173  sample_a.a_count, sample_a.c_count, sample_a.g_count, sample_a.t_count
174  };
175  double const s1_nt_cnt = static_cast<double>( nucleotide_sum( sample_a )); // eucov
176  double const s1_freqs[] = {
177  static_cast<double>( sample_a.a_count ) / s1_nt_cnt,
178  static_cast<double>( sample_a.c_count ) / s1_nt_cnt,
179  static_cast<double>( sample_a.g_count ) / s1_nt_cnt,
180  static_cast<double>( sample_a.t_count ) / s1_nt_cnt
181  };
182 
183  // Get frequencies in sample 2
184  size_t const s2_cnts[] = {
185  sample_b.a_count, sample_b.c_count, sample_b.g_count, sample_b.t_count
186  };
187  double const s2_nt_cnt = static_cast<double>( nucleotide_sum( sample_b )); // eucov
188  double const s2_freqs[] = {
189  static_cast<double>( sample_b.a_count ) / s2_nt_cnt,
190  static_cast<double>( sample_b.c_count ) / s2_nt_cnt,
191  static_cast<double>( sample_b.g_count ) / s2_nt_cnt,
192  static_cast<double>( sample_b.t_count ) / s2_nt_cnt
193  };
194 
195  // Edge case. If there are no counts at all, we return empty.
196  // The follow up function f_st_asymptotically_unbiased_nkdk() will also catch this edge case,
197  // return zeros as well, and nothing will be added to the total F_ST sum.
198  if( s1_nt_cnt == 0.0 || s2_nt_cnt == 0.0 ) {
199  return result;
200  }
201 
202  // Compute their averages.
203  std::array<double, 4> const avg_freqs = {{
204  ( s1_freqs[0] + s2_freqs[0] ) / 2.0,
205  ( s1_freqs[1] + s2_freqs[1] ) / 2.0,
206  ( s1_freqs[2] + s2_freqs[2] ) / 2.0,
207  ( s1_freqs[3] + s2_freqs[3] ) / 2.0
208  }};
209 
210  // Get the sorting order, based on the averages.
211  auto const order = nucleotide_sorting_order( avg_freqs );
212 
213  // Now they are sorted, largest ones first.
214  assert( avg_freqs[order[0]] >= avg_freqs[order[1]] );
215  assert( avg_freqs[order[1]] >= avg_freqs[order[2]] );
216  assert( avg_freqs[order[2]] >= avg_freqs[order[3]] );
217 
218  // Prepare result. We use an array of the nucleotides to get them in the order as needed.
219  static const char nts[] = {'A', 'C', 'G', 'T'};
220  result.first[0] = { nts[order[0]], s1_cnts[order[0]] };
221  result.first[1] = { nts[order[1]], s1_cnts[order[1]] };
222  result.first[2] = { nts[order[2]], s1_cnts[order[2]] };
223  result.first[3] = { nts[order[3]], s1_cnts[order[3]] };
224  result.second[0] = { nts[order[0]], s2_cnts[order[0]] };
225  result.second[1] = { nts[order[1]], s2_cnts[order[1]] };
226  result.second[2] = { nts[order[2]], s2_cnts[order[2]] };
227  result.second[3] = { nts[order[3]], s2_cnts[order[3]] };
228  return result;
229 }
230 
236  Variant const& variant, bool reference_first, SampleCounts const& total
237 ) {
238  // We use sorting networks for speed here. See f_st_asymptotically_unbiased_a1n1a2n2()
239  // for details on the technique.
240 
241  if( reference_first ) {
242  SortedSampleCounts result;
243  switch( variant.reference_base ) {
244  case 'a':
245  case 'A': {
246  result[0] = { 'A', total.a_count };
247  result[1] = { 'C', total.c_count };
248  result[2] = { 'G', total.g_count };
249  result[3] = { 'T', total.t_count };
250  break;
251  }
252  case 'c':
253  case 'C': {
254  result[0] = { 'C', total.c_count };
255  result[1] = { 'A', total.a_count };
256  result[2] = { 'G', total.g_count };
257  result[3] = { 'T', total.t_count };
258  break;
259  }
260  case 'g':
261  case 'G': {
262  result[0] = { 'G', total.g_count };
263  result[1] = { 'A', total.a_count };
264  result[2] = { 'C', total.c_count };
265  result[3] = { 'T', total.t_count };
266  break;
267  }
268  case 't':
269  case 'T': {
270  result[0] = { 'T', total.t_count };
271  result[1] = { 'A', total.a_count };
272  result[2] = { 'C', total.c_count };
273  result[3] = { 'G', total.g_count };
274  break;
275  }
276  default: {
277  throw std::runtime_error(
278  "Cannot use reference base " + utils::char_to_hex( variant.reference_base ) +
279  "to sort base counts."
280  );
281  }
282  }
283  if( result[1].count < result[2].count ) {
284  std::swap( result[1], result[2] );
285  }
286  if( result[1].count < result[3].count ) {
287  std::swap( result[1], result[3] );
288  }
289  if( result[2].count < result[3].count ) {
290  std::swap( result[2], result[3] );
291  }
292  return result;
293  } else {
294  return sorted_sample_counts( total );
295  }
296 }
297 
299  Variant const& variant, bool reference_first, SampleCountsFilterPolicy filter_policy
300 ) {
301  auto const total = merge_sample_counts( variant, filter_policy );
302  return sorted_sample_counts_( variant, reference_first, total );
303 }
304 
305 // =================================================================================================
306 // Allele Count
307 // =================================================================================================
308 
309 size_t allele_count( SampleCounts const& sample )
310 {
311  // We use bool to int conversion for simplicity, and to avoid branching.
312  // For this, we use the fact that bool converts to 1/0 int.
313  static_assert( static_cast<int>( true ) == 1, "Invalid bool(true) to int(1) conversion." );
314  static_assert( static_cast<int>( false ) == 0, "Invalid bool(false) to int(0) conversion." );
315 
316  // Sum up the number of different ACGT counts that are present, to determine whether
317  // this is a SNP, and whether it's biallelic.
318  size_t al_count = 0;
319  al_count += static_cast<int>( sample.a_count > 0 );
320  al_count += static_cast<int>( sample.c_count > 0 );
321  al_count += static_cast<int>( sample.g_count > 0 );
322  al_count += static_cast<int>( sample.t_count > 0 );
323 
324  // Check and return the result.
325  assert( al_count <= 4 );
326  return al_count;
327 }
328 
329 size_t allele_count( SampleCounts const& sample, size_t min_count )
330 {
331  // We need to separate out the min_count == 0 case,
332  // as we do not want to count alelles that are exactly 0...
333  // We do this by setting min_count to 1, so that only true alelles are counted.
334  if( min_count == 0 ) {
335  min_count = 1;
336  }
337 
338  // Sum up the number of different ACGT counts that are at least the min count,
339  // to determine whether this is a SNP, and whether it's biallelic.
340  size_t al_count = 0;
341  al_count += static_cast<int>( sample.a_count >= min_count );
342  al_count += static_cast<int>( sample.c_count >= min_count );
343  al_count += static_cast<int>( sample.g_count >= min_count );
344  al_count += static_cast<int>( sample.t_count >= min_count );
345 
346  // Check and return the result.
347  assert( al_count <= 4 );
348  return al_count;
349 }
350 
351 size_t allele_count( SampleCounts const& sample, size_t min_count, size_t max_count )
352 {
353  // Edge cases checks.
354  if( max_count == 0 ) {
355  return allele_count( sample, min_count );
356  }
357  if( max_count < min_count ) {
358  throw std::invalid_argument(
359  "Cannot compute allele_count() with max_count < min_count. "
360  "min_count == " + std::to_string( min_count ) + ", "
361  "max_count == " + std::to_string( max_count )
362  );
363  }
364 
365  // Same edge case as above.
366  if( min_count == 0 ) {
367  min_count = 1;
368  }
369 
370  // Similar to the other functions above.
371  size_t al_count = 0;
372  al_count += static_cast<int>( sample.a_count >= min_count && sample.a_count <= max_count );
373  al_count += static_cast<int>( sample.c_count >= min_count && sample.c_count <= max_count );
374  al_count += static_cast<int>( sample.g_count >= min_count && sample.g_count <= max_count );
375  al_count += static_cast<int>( sample.t_count >= min_count && sample.t_count <= max_count );
376  return al_count;
377 }
378 
379 // =================================================================================================
380 // Merging
381 // =================================================================================================
382 
384 {
385  // Make sure that we do not forget any fields in case of later refactoring of the struct.
386  // Nope, can't do, as the compiler might allocate more to get proper memory alignment...
387  // static_assert(
388  // sizeof( SampleCounts ) == 6 * sizeof( size_t ),
389  // "Unexpected number of member variables in SampleCounts class"
390  // );
391 
392  p1.a_count += p2.a_count;
393  p1.c_count += p2.c_count;
394  p1.g_count += p2.g_count;
395  p1.t_count += p2.t_count;
396  p1.n_count += p2.n_count;
397  p1.d_count += p2.d_count;
398 }
399 
401 {
402  SampleCounts result = p1;
403  merge_inplace( result, p2 );
404  return result;
405 }
406 
407 SampleCounts merge( std::vector<SampleCounts> const& p, SampleCountsFilterPolicy filter_policy )
408 {
409  SampleCounts result;
410  for( auto const& ps : p ) {
411  if( filter_policy == SampleCountsFilterPolicy::kOnlyPassing && ! ps.status.passing() ) {
412  continue;
413  }
414  result.a_count += ps.a_count;
415  result.c_count += ps.c_count;
416  result.g_count += ps.g_count;
417  result.t_count += ps.t_count;
418  result.n_count += ps.n_count;
419  result.d_count += ps.d_count;
420  }
421  return result;
422 }
423 
424 // =================================================================================================
425 // Consensus
426 // =================================================================================================
427 
428 std::pair<char, double> consensus( SampleCounts const& sample )
429 {
430  // Get total count/read depth with nucleotides.
431  auto const nucleotide_count = nucleotide_sum( sample );
432 
433  // Only compute consensus if we have enough read depth. This can only be if we have
434  // at least some counts. Assert that.
435  if( nucleotide_count == 0 ) {
436  return { 'N', 0.0 };
437  }
438  assert( nucleotide_count > 0 );
439  assert( sample.a_count > 0 || sample.c_count > 0 || sample.g_count > 0 || sample.t_count > 0 );
440  assert(
441  sample.a_count + sample.c_count + sample.g_count + sample.t_count == nucleotide_count
442  );
443 
444  // Fast way of comparing four variables and finding the name of the max one.
445  // We don't want any expensive sorting on dynamic memory (vecors or the like),
446  // but just do pairwise comparisons with indices instead. Let's not even use a loop
447  // (loop unrolling), to be even faster. Four int copies and three comparisons. So smart.
448  size_t const counts[] = { sample.a_count, sample.c_count, sample.g_count, sample.t_count };
449  size_t max_idx = 0;
450  if( counts[1] > counts[max_idx] ) {
451  max_idx = 1;
452  }
453  if( counts[2] > counts[max_idx] ) {
454  max_idx = 2;
455  }
456  if( counts[3] > counts[max_idx] ) {
457  max_idx = 3;
458  }
459 
460  // Now use the index to get the consensus character from a static lookup, and the confidence.
461  static const char nts[] = {'A', 'C', 'G', 'T'};
462  auto const conf
463  = static_cast<double>( counts[max_idx] )
464  / static_cast<double>( nucleotide_count )
465  ;
466  return { nts[max_idx], conf };
467 }
468 
469 std::pair<char, double> consensus( SampleCounts const& sample, bool is_covered )
470 {
471  if( is_covered ) {
472  return consensus( sample );
473  } else {
474  return { 'N', 0.0 };
475  }
476 }
477 
479  Variant const& variant, bool force, SampleCountsFilterPolicy filter_policy
480 ) {
481  auto const ref = utils::to_upper( variant.reference_base );
482  if( ! force && is_valid_base( ref )) {
483  return ref;
484  } else {
485  auto const sorted = sorted_sample_counts( variant, false, filter_policy );
486  if( sorted[0].count > 0 ) {
487  return utils::to_upper( sorted[0].base );
488  }
489  }
490 
491  // Last else case outside, so that compilers always see that this function returns a value.
492  return 'N';
493 }
494 
496  Variant const& variant, bool force, SampleCountsFilterPolicy filter_policy
497 ) {
498  auto const alt = utils::to_upper( variant.alternative_base );
499  if( ! force && is_valid_base( alt )) {
500  return alt;
501  } else {
502  auto const ref = utils::to_upper( variant.reference_base );
503  if( is_valid_base( ref )) {
504  auto const sorted = sorted_sample_counts( variant, true, filter_policy );
505  if( sorted[1].count > 0 ) {
506  return utils::to_upper( sorted[1].base );
507  }
508  }
509  }
510 
511  // Else case outside, so that compilers always see that this function returns a value.
512  return 'N';
513 }
514 
516  Variant& variant, bool force, SampleCountsFilterPolicy filter_policy
517 ) {
518  // Get base data.
519  auto ref = utils::to_upper( variant.reference_base );
520  auto const alt = utils::to_upper( variant.alternative_base );
521 
522  // We only want to compute the total counts if necessary.
523  SampleCounts total;
524  bool computed_total = false;
525 
526  // Set the reference, unless it is already a good value (and we don't force it).
527  // We don't flip the condition here, to keept it consistent with the above functions.
528  if( ! force && is_valid_base( ref )) {
529  // Nothing to do.
530  } else {
531  variant.reference_base = 'N';
532 
533  // Now we need the total base counts.
534  total = merge_sample_counts( variant, filter_policy );
535  computed_total = true;
536 
537  // Use them to define our ref base.
538  auto const sorted = sorted_sample_counts( total );
539  if( sorted[0].count > 0 ) {
540  // Update the ref base. Also update our internal `ref` here, as we need it below.
541  ref = utils::to_upper( sorted[0].base );
542  variant.reference_base = ref;
543  }
544  }
545 
546  // Set the alternative.
547  if( ! force && is_valid_base( alt )) {
548  // Nothing to do.
549  } else {
550  variant.alternative_base = 'N';
551  if( is_valid_base( ref )) {
552  // Only compute the total if needed.
553  if( ! computed_total ) {
554  total = merge_sample_counts( variant, filter_policy );
555  }
556 
557  // Use it to define our alt base.
558  auto const sorted = sorted_sample_counts_( variant, true, total );
559  if( sorted[1].count > 0 ) {
560  variant.alternative_base = utils::to_upper( sorted[1].base );
561  }
562  }
563  }
564 }
565 
567  Variant& variant,
568  char ref_base,
569  bool force,
570  SampleCountsFilterPolicy filter_policy
571 ) {
572  // Shouldn't happen from our parsing etc, but better safe than sorry.
573  if( variant.position == 0 ) {
574  throw std::runtime_error( "Invalid position 0 in Variant." );
575  }
576  ref_base = utils::to_upper( ref_base );
577 
578  // Now use that reference base. If it is in ACGT, we use it as ref; if not, we check against
579  // ambiguity codes to see if it fits with our count-based ref and alt bases instead.
580  if( is_valid_base( ref_base )) {
581  auto const var_ref_base = utils::to_upper( variant.reference_base );
582  if( var_ref_base != 'N' && var_ref_base != ref_base ) {
583  throw std::runtime_error(
584  "At chromosome \"" + variant.chromosome + "\" position " +
585  std::to_string( variant.position ) + ", the Reference Genome has base '" +
586  std::string( 1, ref_base ) + "', while the Variant already has mismatching base '" +
587  std::string( 1, var_ref_base ) + "' set"
588  );
589  }
590 
591  // Now set the base, and obtain the alternative via our normal counting method.
592  variant.reference_base = ref_base;
593  variant.alternative_base = guess_alternative_base( variant, force, filter_policy );
594 
595  } else {
596  // No usable ref base. Run the normal guessing.
597  guess_and_set_ref_and_alt_bases( variant, force, filter_policy );
598  auto const var_ref_base = utils::to_upper( variant.reference_base );
599  auto const var_alt_base = utils::to_upper( variant.alternative_base );
600 
601  // Now we cross check that the ref genome base is a valid base,
602  // and also that it is an ambiguity char that contains either the ref or alt that we found.
603  // If not, something is likely off...
604  // This might be too rigurous though - will have to see in practice, and might change later.
605  bool contains = false;
606  try {
608  contains |= nucleic_acid_code_containment( ref_base, var_ref_base );
609  contains |= nucleic_acid_code_containment( ref_base, var_alt_base );
610  } catch(...) {
611  // The above throws an error if the given bases are not valid.
612  // Catch this, and re-throw a nicer, more understandable exception instead.
613  throw std::runtime_error(
614  "At chromosome \"" + variant.chromosome + "\" position " +
615  std::to_string( variant.position ) + ", the Reference Genome has base '" +
616  std::string( 1, ref_base ) + "', which is not a valid nucleic acid code"
617  );
618  }
619  if( ! contains ) {
620  throw std::runtime_error(
621  "At chromosome \"" + variant.chromosome + "\" position " +
622  std::to_string( variant.position ) + ", the reference base is '" +
623  std::string( 1, var_ref_base ) + "' and the alternative base is '" +
624  std::string( 1, var_alt_base ) +
625  "', determined from nucleotide counts in the data at this position. " +
626  "However, the Reference Genome has base '" + std::string( 1, ref_base ) +
627  "', which does not code for either of them, " +
628  "and hence likely points to some kind of mismatch"
629  );
630  }
631  }
632 }
633 
635  Variant& variant,
636  genesis::sequence::ReferenceGenome const& ref_genome,
637  bool force,
638  SampleCountsFilterPolicy filter_policy
639 ) {
640  // Get the reference base. Throws if seq or base not present.
641  auto const ref_base = ref_genome.get_base( variant.chromosome, variant.position );
642  return guess_and_set_ref_and_alt_bases( variant, ref_base, force, filter_policy );
643 }
644 
645 // =================================================================================================
646 // Output
647 // =================================================================================================
648 
649 std::ostream& operator<<( std::ostream& os, SampleCounts const& bs )
650 {
651  os << "A=" << bs.a_count << ", C=" << bs.c_count << ", G=" << bs.g_count;
652  os << ", T=" << bs.t_count << ", N=" << bs.n_count << ", D=" << bs.d_count;
653  return os;
654 }
655 
656 } // namespace population
657 } // namespace genesis
genesis::placement::swap
void swap(Sample &lhs, Sample &rhs)
Definition: sample.cpp:104
genesis::sequence::ReferenceGenome::get_base
char get_base(std::string const &chromosome, size_t position, bool to_upper=true) const
Get a particular base at a given chromosome and position.
Definition: reference_genome.hpp:174
genesis::population::merge
SampleCounts merge(SampleCounts const &p1, SampleCounts const &p2)
Merge the counts of two SampleCountss.
Definition: population/function/functions.cpp:400
genesis::population::operator<<
std::ostream & operator<<(std::ostream &os, SampleCounts const &bs)
Output stream operator for SampleCounts instances.
Definition: population/function/functions.cpp:649
genesis::population::merge_inplace
void merge_inplace(SampleCounts &p1, SampleCounts const &p2)
Merge the counts of two SampleCountss, by adding the counts of the second (p2) to the first (p1).
Definition: population/function/functions.cpp:383
functions.hpp
genesis::population::SampleCounts
One set of nucleotide sample counts, for example for a given sample that represents a pool of sequenc...
Definition: sample_counts.hpp:56
genesis::sequence::nucleic_acid_code_containment
bool nucleic_acid_code_containment(char a, char b, bool undetermined_matches_all)
Compare two nucleic acid codes and check if they are equal, taking degenerated/ambiguous characters i...
Definition: codes.cpp:596
genesis::population::SampleCounts::a_count
size_type a_count
Count of all A nucleotides that are present in the sample.
Definition: sample_counts.hpp:66
genesis::population::Variant::position
size_t position
Definition: variant.hpp:74
genesis::population::set_base_count
void set_base_count(SampleCounts &sample, char base, SampleCounts::size_type value)
Set the count for a base given as a char.
Definition: population/function/functions.cpp:86
genesis::population::nucleotide_sum
constexpr size_t nucleotide_sum(SampleCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: population/function/functions.hpp:296
genesis::population::allele_count
size_t allele_count(SampleCounts const &sample)
Return the number of alleles, that is, of non-zero nucleotide counts of the sample.
Definition: population/function/functions.cpp:309
genesis::population::guess_alternative_base
char guess_alternative_base(Variant const &variant, bool force, SampleCountsFilterPolicy filter_policy)
Guess the alternative base of a Variant.
Definition: population/function/functions.cpp:495
genesis::population::Variant::reference_base
char reference_base
Definition: variant.hpp:79
genesis::population::sorted_sample_counts_
SortedSampleCounts sorted_sample_counts_(Variant const &variant, bool reference_first, SampleCounts const &total)
Local helper function that takes an already computed total from merge_sample_counts(),...
Definition: population/function/functions.cpp:235
genesis::population::get_base_count
SampleCounts::size_type get_base_count(SampleCounts const &sample, char base)
Get the count for a base given as a char.
Definition: population/function/functions.cpp:50
genesis::population::is_covered
bool is_covered(GenomeRegion const &region, std::string const &chromosome, size_t position)
Test whether the chromosome/position is within a given genomic region.
Definition: genome_region.cpp:207
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::population::SampleCountsFilterPolicy
SampleCountsFilterPolicy
Policy helper to decide how to treat filtered SampleCounts.
Definition: sample_counts_filter.hpp:211
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
genesis::population::SampleCounts::t_count
size_type t_count
Count of all T nucleotides that are present in the sample.
Definition: sample_counts.hpp:81
genesis::population::SortedSampleCounts
Ordered array of sample counts for the four nucleotides.
Definition: sample_counts.hpp:110
genesis::population::SampleCounts::n_count
size_type n_count
Count of all N (undetermined/any) nucleotides that are present in the sample.
Definition: sample_counts.hpp:86
genesis::population::merge_sample_counts
SampleCounts merge_sample_counts(Variant const &v, SampleCountsFilterPolicy filter_policy)
Merge the counts of a vector SampleCountss.
Definition: population/function/functions.hpp:282
genesis::population::SampleCounts::c_count
size_type c_count
Count of all C nucleotides that are present in the sample.
Definition: sample_counts.hpp:71
genesis::population::is_valid_base
constexpr bool is_valid_base(char c)
Return whether a given base is in ACGT, case insensitive.
Definition: population/function/functions.hpp:56
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::SampleCountsFilterPolicy::kOnlyPassing
@ kOnlyPassing
char.hpp
genesis::population::nucleotide_sorting_order
std::array< size_t, 4 > nucleotide_sorting_order(std::array< T, 4 > const &values)
Return the sorting order of four values, for instance of the four nucleotides ACGT,...
Definition: population/function/functions.hpp:128
genesis::population::guess_and_set_ref_and_alt_bases
void guess_and_set_ref_and_alt_bases(Variant &variant, bool force, SampleCountsFilterPolicy filter_policy)
Guess the reference and alternative bases for a Variant, and set them.
Definition: population/function/functions.cpp:515
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
genesis::population::sorted_average_sample_counts
std::pair< SortedSampleCounts, SortedSampleCounts > sorted_average_sample_counts(SampleCounts const &sample_a, SampleCounts const &sample_b)
Return the sorted base counts of both input samples, orderd by the average frequencies of the nucleot...
Definition: population/function/functions.cpp:164
genesis::population::consensus
std::pair< char, double > consensus(SampleCounts const &sample)
Consensus character for a SampleCounts, and its confidence.
Definition: population/function/functions.cpp:428
genesis::population::Variant::alternative_base
char alternative_base
Definition: variant.hpp:80
genesis::population::guess_reference_base
char guess_reference_base(Variant const &variant, bool force, SampleCountsFilterPolicy filter_policy)
Guess the reference base of a Variant.
Definition: population/function/functions.cpp:478
genesis::utils::contains
bool contains(const C &v, const T &x)
Returns whether a container object has a certain element.
Definition: algorithm.hpp:74
genesis::population::SampleCounts::d_count
size_type d_count
Count of all deleted (*) nucleotides that are present in the sample.
Definition: sample_counts.hpp:91
genesis::sequence::ReferenceGenome
Lookup of Sequences of a reference genome.
Definition: reference_genome.hpp:65
genesis::population::sorted_sample_counts
SortedSampleCounts sorted_sample_counts(SampleCounts const &sample)
Return the order of base counts (nucleotides), largest one first.
Definition: population/function/functions.cpp:134
genesis::population::SampleCounts::size_type
size_t size_type
Public alias for the size type that the class uses to store its counts.
Definition: sample_counts.hpp:61
codes.hpp
genesis::population::Variant::chromosome
std::string chromosome
Definition: variant.hpp:73
genesis::population::SampleCounts::g_count
size_type g_count
Count of all G nucleotides that are present in the sample.
Definition: sample_counts.hpp:76