A library for working with phylogenetic and population genetic data.
v0.32.0
population/function/functions.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTION_FUNCTIONS_H_
2 #define GENESIS_POPULATION_FUNCTION_FUNCTIONS_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
39 
40 #include <array>
41 #include <iosfwd>
42 #include <utility>
43 #include <string>
44 #include <vector>
45 
46 namespace genesis {
47 namespace population {
48 
49 // =================================================================================================
50 // Bases and Counts
51 // =================================================================================================
52 
56 inline constexpr bool is_valid_base( char c )
57 {
58  // Can't use a function call here, to comply with C++11 constexpr rules.
59  // c = utils::to_upper( c );
60  return (
61  c == 'A' || c == 'a' ||
62  c == 'C' || c == 'c' ||
63  c == 'G' || c == 'g' ||
64  c == 'T' || c == 't'
65  );
66 }
67 
71 inline constexpr bool is_valid_base_or_n( char c )
72 {
73  // Can't use a function call here, to comply with C++11 constexpr rules.
74  // c = utils::to_upper( c );
75  return (
76  c == 'A' || c == 'a' ||
77  c == 'C' || c == 'c' ||
78  c == 'G' || c == 'g' ||
79  c == 'T' || c == 't' ||
80  c == 'N' || c == 'n'
81  );
82 }
83 
89 SampleCounts::size_type get_base_count( SampleCounts const& sample, char base );
90 
96 void set_base_count( SampleCounts& sample, char base, SampleCounts::size_type value );
97 
98 // =================================================================================================
99 // Sorting
100 // =================================================================================================
101 
127 template<typename T>
128 std::array<size_t, 4> nucleotide_sorting_order( std::array<T, 4> const& values )
129 {
130  // Sort quickly via sorting network, putting large values first.
131  // See https://stackoverflow.com/a/25070688/4184258
132  auto indices = std::array<size_t, 4>{{ 0, 1, 2, 3 }};
133  if( values[indices[0]] < values[indices[1]] ) {
134  std::swap( indices[0], indices[1] );
135  }
136  if( values[indices[2]] < values[indices[3]] ) {
137  std::swap( indices[2], indices[3] );
138  }
139  if( values[indices[0]] < values[indices[2]] ) {
140  std::swap( indices[0], indices[2] );
141  }
142  if( values[indices[1]] < values[indices[3]] ) {
143  std::swap( indices[1], indices[3] );
144  }
145  if( values[indices[1]] < values[indices[2]] ) {
146  std::swap( indices[1], indices[2] );
147  }
148 
149  // Now they are sorted, largest ones first.
150  assert( values[indices[0]] >= values[indices[1]] );
151  assert( values[indices[1]] >= values[indices[2]] );
152  assert( values[indices[2]] >= values[indices[3]] );
153 
154  return indices;
155 }
156 
164 template<typename T>
165 std::array<size_t, 6> sample_counts_sorting_order( std::array<T, 6> const& v )
166 {
167  // Implementation inspired by https://stackoverflow.com/a/2792216/4184258
168  // We did not test if this is faster than a sorting network here. Fast enough for now anyway.
169 
170  // Obtain the index at which each value ends up in the sorting order, largest one first.
171  int i0 = (v[0] < v[1]) + (v[0] < v[2]) + (v[0] < v[3]) + (v[0] < v[4]) + (v[0] < v[5]);
172  int i1 = (v[1] <= v[0]) + (v[1] < v[2]) + (v[1] < v[3]) + (v[1] < v[4]) + (v[1] < v[5]);
173  int i2 = (v[2] <= v[0]) + (v[2] <= v[1]) + (v[2] < v[3]) + (v[2] < v[4]) + (v[2] < v[5]);
174  int i3 = (v[3] <= v[0]) + (v[3] <= v[1]) + (v[3] <= v[2]) + (v[3] < v[4]) + (v[3] < v[5]);
175  int i4 = (v[4] <= v[0]) + (v[4] <= v[1]) + (v[4] <= v[2]) + (v[4] <= v[3]) + (v[4] < v[5]);
176  int i5 = (v[5] <= v[0]) + (v[5] <= v[1]) + (v[5] <= v[2]) + (v[5] <= v[3]) + (v[5] <= v[4]);
177  assert( i0 + i1 + i2 + i3 + i4 + i5 == 15 );
178 
179  // At those indices in the result, set the position that they need to point to.
180  std::array<size_t, 6> order;
181  order[i0] = 0;
182  order[i1] = 1;
183  order[i2] = 2;
184  order[i3] = 3;
185  order[i4] = 4;
186  order[i5] = 5;
187 
188  // Now everything is sorted.
189  assert( v[order[0]] >= v[order[1]] );
190  assert( v[order[1]] >= v[order[2]] );
191  assert( v[order[2]] >= v[order[3]] );
192  assert( v[order[3]] >= v[order[4]] );
193  assert( v[order[4]] >= v[order[5]] );
194  return order;
195 }
196 
200 SortedSampleCounts sorted_sample_counts( SampleCounts const& sample );
201 
209 std::pair<SortedSampleCounts, SortedSampleCounts> sorted_average_sample_counts(
210  SampleCounts const& sample_a,
211  SampleCounts const& sample_b
212 );
213 
221 SortedSampleCounts sorted_sample_counts(
222  Variant const& variant, bool reference_first, SampleCountsFilterPolicy filter_policy
223 );
224 
225 // =================================================================================================
226 // Allele Count
227 // =================================================================================================
228 
236 size_t allele_count( SampleCounts const& sample );
237 
246 size_t allele_count( SampleCounts const& sample, size_t min_count );
247 
257 size_t allele_count( SampleCounts const& sample, size_t min_count, size_t max_count );
258 
259 // =================================================================================================
260 // Merging
261 // =================================================================================================
262 
267 void merge_inplace( SampleCounts& p1, SampleCounts const& p2 );
268 
272 SampleCounts merge( SampleCounts const& p1, SampleCounts const& p2 );
273 
277 SampleCounts merge( std::vector<SampleCounts> const& p, SampleCountsFilterPolicy filter_policy );
278 
283 {
284  return merge( v.samples, filter_policy );
285 }
286 
296 inline constexpr size_t nucleotide_sum( SampleCounts const& sample )
297 {
298  return sample.a_count + sample.c_count + sample.g_count + sample.t_count;
299 }
300 
306 inline size_t total_nucleotide_sum( Variant const& variant, SampleCountsFilterPolicy filter_policy )
307 {
308  return nucleotide_sum( merge_sample_counts( variant, filter_policy ));
309 }
310 
318 inline constexpr size_t sample_counts_sum( SampleCounts const& sample )
319 {
320  return
321  sample.a_count +
322  sample.c_count +
323  sample.g_count +
324  sample.t_count +
325  sample.n_count +
326  sample.d_count
327  ;
328 }
329 
335 inline size_t total_sample_counts_sum( Variant const& variant, SampleCountsFilterPolicy filter_policy )
336 {
337  return sample_counts_sum( merge_sample_counts( variant, filter_policy ));
338 }
339 
340 // =================================================================================================
341 // Consensus
342 // =================================================================================================
343 
353 std::pair<char, double> consensus( SampleCounts const& sample );
354 
364 std::pair<char, double> consensus( SampleCounts const& sample, bool is_covered );
365 
376  Variant const& variant,
377  bool force = false,
379 );
380 
395  Variant const& variant,
396  bool force = false,
398 );
399 
407  Variant& variant,
408  bool force = false,
410 );
411 
430  Variant& variant,
431  char ref_base,
432  bool force = false,
434 );
435 
444  Variant& variant,
445  genesis::sequence::ReferenceGenome const& ref_genome,
446  bool force = false,
448 );
449 
450 // =================================================================================================
451 // Output
452 // =================================================================================================
453 
457 std::ostream& operator<<( std::ostream& os, SampleCounts const& bs );
458 
459 } // namespace population
460 } // namespace genesis
461 
462 #endif // include guard
genesis::placement::swap
void swap(Sample &lhs, Sample &rhs)
Definition: sample.cpp:104
genesis::population::total_sample_counts_sum
size_t total_sample_counts_sum(Variant const &variant, SampleCountsFilterPolicy filter_policy)
Sum up all the base counts at this sample, that is, the sum of all A, C, G, T, as well as the N and D...
Definition: population/function/functions.hpp:335
genesis::population::merge
SampleCounts merge(SampleCounts const &p1, SampleCounts const &p2)
Merge the counts of two SampleCountss.
Definition: population/function/functions.cpp:400
genesis::population::operator<<
std::ostream & operator<<(std::ostream &os, SampleCounts const &bs)
Output stream operator for SampleCounts instances.
Definition: population/function/functions.cpp:649
genesis::population::merge_inplace
void merge_inplace(SampleCounts &p1, SampleCounts const &p2)
Merge the counts of two SampleCountss, by adding the counts of the second (p2) to the first (p1).
Definition: population/function/functions.cpp:383
genesis::population::SampleCounts
One set of nucleotide sample counts, for example for a given sample that represents a pool of sequenc...
Definition: sample_counts.hpp:56
genesis::population::SampleCounts::a_count
size_type a_count
Count of all A nucleotides that are present in the sample.
Definition: sample_counts.hpp:66
genesis::population::set_base_count
void set_base_count(SampleCounts &sample, char base, SampleCounts::size_type value)
Set the count for a base given as a char.
Definition: population/function/functions.cpp:86
genesis::population::nucleotide_sum
constexpr size_t nucleotide_sum(SampleCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: population/function/functions.hpp:296
genesis::population::allele_count
size_t allele_count(SampleCounts const &sample)
Return the number of alleles, that is, of non-zero nucleotide counts of the sample.
Definition: population/function/functions.cpp:309
genesis::population::guess_alternative_base
char guess_alternative_base(Variant const &variant, bool force, SampleCountsFilterPolicy filter_policy)
Guess the alternative base of a Variant.
Definition: population/function/functions.cpp:495
genesis::population::get_base_count
SampleCounts::size_type get_base_count(SampleCounts const &sample, char base)
Get the count for a base given as a char.
Definition: population/function/functions.cpp:50
genesis::population::is_covered
bool is_covered(GenomeRegion const &region, std::string const &chromosome, size_t position)
Test whether the chromosome/position is within a given genomic region.
Definition: genome_region.cpp:207
genesis::population::SampleCountsFilterPolicy
SampleCountsFilterPolicy
Policy helper to decide how to treat filtered SampleCounts.
Definition: sample_counts_filter.hpp:211
sample_counts_filter.hpp
genesis::population::SampleCounts::t_count
size_type t_count
Count of all T nucleotides that are present in the sample.
Definition: sample_counts.hpp:81
sample_counts.hpp
genesis::population::is_valid_base_or_n
constexpr bool is_valid_base_or_n(char c)
Return whether a given base is in ACGTN, case insensitive.
Definition: population/function/functions.hpp:71
genesis::population::SampleCounts::n_count
size_type n_count
Count of all N (undetermined/any) nucleotides that are present in the sample.
Definition: sample_counts.hpp:86
genesis::population::merge_sample_counts
SampleCounts merge_sample_counts(Variant const &v, SampleCountsFilterPolicy filter_policy)
Merge the counts of a vector SampleCountss.
Definition: population/function/functions.hpp:282
genesis::population::SampleCounts::c_count
size_type c_count
Count of all C nucleotides that are present in the sample.
Definition: sample_counts.hpp:71
genesis::population::is_valid_base
constexpr bool is_valid_base(char c)
Return whether a given base is in ACGT, case insensitive.
Definition: population/function/functions.hpp:56
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
reference_genome.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::SampleCountsFilterPolicy::kOnlyPassing
@ kOnlyPassing
char.hpp
genesis::population::nucleotide_sorting_order
std::array< size_t, 4 > nucleotide_sorting_order(std::array< T, 4 > const &values)
Return the sorting order of four values, for instance of the four nucleotides ACGT,...
Definition: population/function/functions.hpp:128
genesis::population::guess_and_set_ref_and_alt_bases
void guess_and_set_ref_and_alt_bases(Variant &variant, bool force, SampleCountsFilterPolicy filter_policy)
Guess the reference and alternative bases for a Variant, and set them.
Definition: population/function/functions.cpp:515
genesis::population::Variant::samples
std::vector< SampleCounts > samples
Definition: variant.hpp:82
genesis::population::sorted_average_sample_counts
std::pair< SortedSampleCounts, SortedSampleCounts > sorted_average_sample_counts(SampleCounts const &sample_a, SampleCounts const &sample_b)
Return the sorted base counts of both input samples, orderd by the average frequencies of the nucleot...
Definition: population/function/functions.cpp:164
genesis::population::consensus
std::pair< char, double > consensus(SampleCounts const &sample)
Consensus character for a SampleCounts, and its confidence.
Definition: population/function/functions.cpp:428
genesis::population::total_nucleotide_sum
size_t total_nucleotide_sum(Variant const &variant, SampleCountsFilterPolicy filter_policy)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
Definition: population/function/functions.hpp:306
variant.hpp
genesis::population::guess_reference_base
char guess_reference_base(Variant const &variant, bool force, SampleCountsFilterPolicy filter_policy)
Guess the reference base of a Variant.
Definition: population/function/functions.cpp:478
genesis::population::SampleCounts::d_count
size_type d_count
Count of all deleted (*) nucleotides that are present in the sample.
Definition: sample_counts.hpp:91
genesis::sequence::ReferenceGenome
Lookup of Sequences of a reference genome.
Definition: reference_genome.hpp:65
genesis::population::sorted_sample_counts
SortedSampleCounts sorted_sample_counts(SampleCounts const &sample)
Return the order of base counts (nucleotides), largest one first.
Definition: population/function/functions.cpp:134
genesis::population::SampleCounts::size_type
size_t size_type
Public alias for the size type that the class uses to store its counts.
Definition: sample_counts.hpp:61
genesis::population::sample_counts_sum
constexpr size_t sample_counts_sum(SampleCounts const &sample)
Sum up all the base counts at this sample, that is, the sum of all A, C, G, T, as well as the N and D...
Definition: population/function/functions.hpp:318
genesis::population::sample_counts_sorting_order
std::array< size_t, 6 > sample_counts_sorting_order(std::array< T, 6 > const &v)
Return the sorting order of six values, for instance of the four nucleotides ACGT and the N and D cou...
Definition: population/function/functions.hpp:165
genesis::population::SampleCounts::g_count
size_type g_count
Count of all G nucleotides that are present in the sample.
Definition: sample_counts.hpp:76