|
A library for working with phylogenetic and population genetic data.
v0.32.0
|
|
Go to the documentation of this file.
44 namespace population {
81 throw std::runtime_error(
123 throw std::runtime_error(
146 if( result[0].count < result[1].count ) {
149 if( result[2].count < result[3].count ) {
152 if( result[0].count < result[2].count ) {
155 if( result[1].count < result[3].count ) {
158 if( result[1].count < result[2].count ) {
169 auto result = std::pair<SortedSampleCounts, SortedSampleCounts>{};
172 size_t const s1_cnts[] = {
175 double const s1_nt_cnt =
static_cast<double>(
nucleotide_sum( sample_a ));
176 double const s1_freqs[] = {
177 static_cast<double>( sample_a.
a_count ) / s1_nt_cnt,
178 static_cast<double>( sample_a.
c_count ) / s1_nt_cnt,
179 static_cast<double>( sample_a.
g_count ) / s1_nt_cnt,
180 static_cast<double>( sample_a.
t_count ) / s1_nt_cnt
184 size_t const s2_cnts[] = {
187 double const s2_nt_cnt =
static_cast<double>(
nucleotide_sum( sample_b ));
188 double const s2_freqs[] = {
189 static_cast<double>( sample_b.
a_count ) / s2_nt_cnt,
190 static_cast<double>( sample_b.
c_count ) / s2_nt_cnt,
191 static_cast<double>( sample_b.
g_count ) / s2_nt_cnt,
192 static_cast<double>( sample_b.
t_count ) / s2_nt_cnt
198 if( s1_nt_cnt == 0.0 || s2_nt_cnt == 0.0 ) {
203 std::array<double, 4>
const avg_freqs = {{
204 ( s1_freqs[0] + s2_freqs[0] ) / 2.0,
205 ( s1_freqs[1] + s2_freqs[1] ) / 2.0,
206 ( s1_freqs[2] + s2_freqs[2] ) / 2.0,
207 ( s1_freqs[3] + s2_freqs[3] ) / 2.0
214 assert( avg_freqs[order[0]] >= avg_freqs[order[1]] );
215 assert( avg_freqs[order[1]] >= avg_freqs[order[2]] );
216 assert( avg_freqs[order[2]] >= avg_freqs[order[3]] );
219 static const char nts[] = {
'A',
'C',
'G',
'T'};
220 result.first[0] = { nts[order[0]], s1_cnts[order[0]] };
221 result.first[1] = { nts[order[1]], s1_cnts[order[1]] };
222 result.first[2] = { nts[order[2]], s1_cnts[order[2]] };
223 result.first[3] = { nts[order[3]], s1_cnts[order[3]] };
224 result.second[0] = { nts[order[0]], s2_cnts[order[0]] };
225 result.second[1] = { nts[order[1]], s2_cnts[order[1]] };
226 result.second[2] = { nts[order[2]], s2_cnts[order[2]] };
227 result.second[3] = { nts[order[3]], s2_cnts[order[3]] };
241 if( reference_first ) {
246 result[0] = {
'A', total.
a_count };
247 result[1] = {
'C', total.
c_count };
248 result[2] = {
'G', total.
g_count };
249 result[3] = {
'T', total.
t_count };
254 result[0] = {
'C', total.
c_count };
255 result[1] = {
'A', total.
a_count };
256 result[2] = {
'G', total.
g_count };
257 result[3] = {
'T', total.
t_count };
262 result[0] = {
'G', total.
g_count };
263 result[1] = {
'A', total.
a_count };
264 result[2] = {
'C', total.
c_count };
265 result[3] = {
'T', total.
t_count };
270 result[0] = {
'T', total.
t_count };
271 result[1] = {
'A', total.
a_count };
272 result[2] = {
'C', total.
c_count };
273 result[3] = {
'G', total.
g_count };
277 throw std::runtime_error(
279 "to sort base counts."
283 if( result[1].count < result[2].count ) {
286 if( result[1].count < result[3].count ) {
289 if( result[2].count < result[3].count ) {
313 static_assert(
static_cast<int>(
true ) == 1,
"Invalid bool(true) to int(1) conversion." );
314 static_assert(
static_cast<int>(
false ) == 0,
"Invalid bool(false) to int(0) conversion." );
319 al_count +=
static_cast<int>( sample.
a_count > 0 );
320 al_count +=
static_cast<int>( sample.
c_count > 0 );
321 al_count +=
static_cast<int>( sample.
g_count > 0 );
322 al_count +=
static_cast<int>( sample.
t_count > 0 );
325 assert( al_count <= 4 );
334 if( min_count == 0 ) {
341 al_count +=
static_cast<int>( sample.
a_count >= min_count );
342 al_count +=
static_cast<int>( sample.
c_count >= min_count );
343 al_count +=
static_cast<int>( sample.
g_count >= min_count );
344 al_count +=
static_cast<int>( sample.
t_count >= min_count );
347 assert( al_count <= 4 );
354 if( max_count == 0 ) {
357 if( max_count < min_count ) {
358 throw std::invalid_argument(
359 "Cannot compute allele_count() with max_count < min_count. "
366 if( min_count == 0 ) {
372 al_count +=
static_cast<int>( sample.
a_count >= min_count && sample.
a_count <= max_count );
373 al_count +=
static_cast<int>( sample.
c_count >= min_count && sample.
c_count <= max_count );
374 al_count +=
static_cast<int>( sample.
g_count >= min_count && sample.
g_count <= max_count );
375 al_count +=
static_cast<int>( sample.
t_count >= min_count && sample.
t_count <= max_count );
410 for(
auto const& ps : p ) {
435 if( nucleotide_count == 0 ) {
438 assert( nucleotide_count > 0 );
450 if( counts[1] > counts[max_idx] ) {
453 if( counts[2] > counts[max_idx] ) {
456 if( counts[3] > counts[max_idx] ) {
461 static const char nts[] = {
'A',
'C',
'G',
'T'};
463 =
static_cast<double>( counts[max_idx] )
464 /
static_cast<double>( nucleotide_count )
466 return { nts[max_idx], conf };
486 if( sorted[0].count > 0 ) {
505 if( sorted[1].count > 0 ) {
524 bool computed_total =
false;
535 computed_total =
true;
539 if( sorted[0].count > 0 ) {
553 if( ! computed_total ) {
559 if( sorted[1].count > 0 ) {
574 throw std::runtime_error(
"Invalid position 0 in Variant." );
582 if( var_ref_base !=
'N' && var_ref_base != ref_base ) {
583 throw std::runtime_error(
584 "At chromosome \"" + variant.
chromosome +
"\" position " +
586 std::string( 1, ref_base ) +
"', while the Variant already has mismatching base '" +
587 std::string( 1, var_ref_base ) +
"' set"
613 throw std::runtime_error(
614 "At chromosome \"" + variant.
chromosome +
"\" position " +
616 std::string( 1, ref_base ) +
"', which is not a valid nucleic acid code"
620 throw std::runtime_error(
621 "At chromosome \"" + variant.
chromosome +
"\" position " +
623 std::string( 1, var_ref_base ) +
"' and the alternative base is '" +
624 std::string( 1, var_alt_base ) +
625 "', determined from nucleotide counts in the data at this position. " +
626 "However, the Reference Genome has base '" + std::string( 1, ref_base ) +
627 "', which does not code for either of them, " +
628 "and hence likely points to some kind of mismatch"
void swap(Sample &lhs, Sample &rhs)
char get_base(std::string const &chromosome, size_t position, bool to_upper=true) const
Get a particular base at a given chromosome and position.
SampleCounts merge(SampleCounts const &p1, SampleCounts const &p2)
Merge the counts of two SampleCountss.
std::ostream & operator<<(std::ostream &os, SampleCounts const &bs)
Output stream operator for SampleCounts instances.
void merge_inplace(SampleCounts &p1, SampleCounts const &p2)
Merge the counts of two SampleCountss, by adding the counts of the second (p2) to the first (p1).
One set of nucleotide sample counts, for example for a given sample that represents a pool of sequenc...
bool nucleic_acid_code_containment(char a, char b, bool undetermined_matches_all)
Compare two nucleic acid codes and check if they are equal, taking degenerated/ambiguous characters i...
size_type a_count
Count of all A nucleotides that are present in the sample.
void set_base_count(SampleCounts &sample, char base, SampleCounts::size_type value)
Set the count for a base given as a char.
constexpr size_t nucleotide_sum(SampleCounts const &sample)
Count of the pure nucleotide bases at this position, that is, the sum of all A, C,...
size_t allele_count(SampleCounts const &sample)
Return the number of alleles, that is, of non-zero nucleotide counts of the sample.
char guess_alternative_base(Variant const &variant, bool force, SampleCountsFilterPolicy filter_policy)
Guess the alternative base of a Variant.
SortedSampleCounts sorted_sample_counts_(Variant const &variant, bool reference_first, SampleCounts const &total)
Local helper function that takes an already computed total from merge_sample_counts(),...
SampleCounts::size_type get_base_count(SampleCounts const &sample, char base)
Get the count for a base given as a char.
bool is_covered(GenomeRegion const ®ion, std::string const &chromosome, size_t position)
Test whether the chromosome/position is within a given genomic region.
std::string to_string(GenomeLocus const &locus)
SampleCountsFilterPolicy
Policy helper to decide how to treat filtered SampleCounts.
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
size_type t_count
Count of all T nucleotides that are present in the sample.
Ordered array of sample counts for the four nucleotides.
size_type n_count
Count of all N (undetermined/any) nucleotides that are present in the sample.
SampleCounts merge_sample_counts(Variant const &v, SampleCountsFilterPolicy filter_policy)
Merge the counts of a vector SampleCountss.
size_type c_count
Count of all C nucleotides that are present in the sample.
constexpr bool is_valid_base(char c)
Return whether a given base is in ACGT, case insensitive.
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
std::array< size_t, 4 > nucleotide_sorting_order(std::array< T, 4 > const &values)
Return the sorting order of four values, for instance of the four nucleotides ACGT,...
void guess_and_set_ref_and_alt_bases(Variant &variant, bool force, SampleCountsFilterPolicy filter_policy)
Guess the reference and alternative bases for a Variant, and set them.
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
std::pair< SortedSampleCounts, SortedSampleCounts > sorted_average_sample_counts(SampleCounts const &sample_a, SampleCounts const &sample_b)
Return the sorted base counts of both input samples, orderd by the average frequencies of the nucleot...
std::pair< char, double > consensus(SampleCounts const &sample)
Consensus character for a SampleCounts, and its confidence.
char guess_reference_base(Variant const &variant, bool force, SampleCountsFilterPolicy filter_policy)
Guess the reference base of a Variant.
bool contains(const C &v, const T &x)
Returns whether a container object has a certain element.
size_type d_count
Count of all deleted (*) nucleotides that are present in the sample.
Lookup of Sequences of a reference genome.
SortedSampleCounts sorted_sample_counts(SampleCounts const &sample)
Return the order of base counts (nucleotides), largest one first.
size_t size_type
Public alias for the size type that the class uses to store its counts.
size_type g_count
Count of all G nucleotides that are present in the sample.