A library for working with phylogenetic and population genetic data.
v0.27.0
simple_pileup_common.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
34 
35 #include <cassert>
36 #include <stdexcept>
37 
38 namespace genesis {
39 namespace population {
40 
41 // =================================================================================================
42 // Simple (m)pileup Reader Helper Functions
43 // =================================================================================================
44 
46  SimplePileupReader::Sample const& sample,
47  unsigned char min_phred_score
48 ) {
49  BaseCounts result;
50 
51  // Tally up the bases.
52  size_t total_count = 0;
53  size_t skip_count = 0;
54  size_t rna_count = 0;
55  for( size_t i = 0; i < sample.read_bases.size(); ++i ) {
56 
57  // Quality control if available. Skip bases that are below the threshold.
58  if( !sample.phred_scores.empty() && sample.phred_scores[i] < min_phred_score ) {
59  ++skip_count;
60  continue;
61  }
62 
63  ++total_count;
64  switch( sample.read_bases[i] ) {
65  case 'a':
66  case 'A': {
67  ++result.a_count;
68  break;
69  }
70  case 'c':
71  case 'C': {
72  ++result.c_count;
73  break;
74  }
75  case 'g':
76  case 'G': {
77  ++result.g_count;
78  break;
79  }
80  case 't':
81  case 'T': {
82  ++result.t_count;
83  break;
84  }
85  case 'n':
86  case 'N': {
87  ++result.n_count;
88  break;
89  }
90  case '*':
91  case '#': {
92  ++result.d_count;
93  break;
94  }
95  case '<':
96  case '>': {
97  // Skipping RNA symbols. But count them, for sanity check.
98  (void) rna_count;
99  ++rna_count;
100  break;
101  }
102  default: {
103  throw std::runtime_error(
104  "Malformed pileup sample: Invalid allele character " +
105  utils::char_to_hex( sample.read_bases[i] )
106  );
107  }
108  }
109  }
110 
111  // Sanity checks and assertions.
112  (void) total_count;
113  assert(
114  total_count ==
115  result.a_count + result.c_count + result.g_count + result.t_count +
116  result.n_count + result.d_count + rna_count
117  );
118  assert( skip_count + total_count == sample.read_bases.size() );
119 
120  // Sum sanity checks. There seems to be a very weird special case (found in the PoPoolation2
121  // test dataset) where a line contains a deletion with a low phred score (`*`) that is not
122  // counted in the "Number of reads covering this position" counter:
123  // ` 89795 2R 113608 N 1 T$ A 0 * *`
124  // We account for this here by allowing exactly one such base that is either a deletion
125  // or a skip due to low phred score. There is no information that we know of about how
126  // "empty" lines should be treated in pileip, so we have to guess, and that here seems to work.
127  auto const base_count =
128  result.a_count + result.c_count + result.g_count + result.t_count + result.n_count
129  ;
130  if(
131  sample.read_bases.size() != sample.read_coverage &&
132  !( base_count == 0 && result.d_count + skip_count == 1 )
133  ) {
134 
135  throw std::runtime_error(
136  "Malformed pileup sample: Given read count (" + std::to_string( sample.read_coverage ) +
137  ") does not match the number of bases found in the sample (" +
138  std::to_string( sample.read_bases.size() ) + ")"
139  );
140  }
141 
142  return result;
143 }
144 
146  SimplePileupReader::Record const& record,
147  unsigned char min_phred_score
148 ) {
149  // Set basic data
150  Variant result;
151  result.chromosome = record.chromosome;
152  result.position = record.position;
153  result.reference_base = utils::to_upper( record.reference_base );
154 
155  // Convert the individual samples
156  result.samples.reserve( record.samples.size() );
157  for( size_t i = 0; i < record.samples.size(); ++i ) {
158  result.samples.push_back( convert_to_base_counts( record.samples[i], min_phred_score ));
159  }
160 
161  // Pileup does not contain ALT bases, so infer them from counts,
162  // using the base with the most counts that is not the reference base.
163  // We only do this if we have a reference base though, as otherwise, the sorting and alternative
164  // is meaningless anyway. Only need to check upper case here, as we converted above.
165  // Also, we do not set the alt base if it does not have any counts, and in that case is also
166  // meaningless to have an alt base.
167  if(
168  result.reference_base == 'A' ||
169  result.reference_base == 'C' ||
170  result.reference_base == 'G' ||
171  result.reference_base == 'T'
172  ) {
173  auto const sorted = sorted_base_counts( result, true );
174  if( sorted[1].count > 0 ) {
175  result.alternative_base = utils::to_upper( sorted[1].base );
176  }
177  }
178 
179  return result;
180 }
181 
182 } // namespace population
183 } // namespace genesis
genesis::population::BaseCounts::d_count
size_t d_count
Count of all deleted (*) nucleotides that are present in the sample.
Definition: base_counts.hpp:84
genesis::population::convert_to_variant
Variant convert_to_variant(SimplePileupReader::Record const &record, unsigned char min_phred_score)
Definition: simple_pileup_common.cpp:145
genesis::population::BaseCounts::t_count
size_t t_count
Count of all T nucleotides that are present in the sample.
Definition: base_counts.hpp:74
genesis::population::SimplePileupReader::Record::position
size_t position
Definition: simple_pileup_reader.hpp:154
genesis::population::Variant::position
size_t position
Definition: variant.hpp:65
genesis::population::BaseCounts::g_count
size_t g_count
Count of all G nucleotides that are present in the sample.
Definition: base_counts.hpp:69
genesis::population::BaseCounts::a_count
size_t a_count
Count of all A nucleotides that are present in the sample.
Definition: base_counts.hpp:59
genesis::population::Variant::reference_base
char reference_base
Definition: variant.hpp:66
genesis::population::SimplePileupReader::Sample::read_coverage
size_t read_coverage
Total count of reads covering this position.
Definition: simple_pileup_reader.hpp:116
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
genesis::population::BaseCounts::n_count
size_t n_count
Count of all N (undetermined/any) nucleotides that are present in the sample.
Definition: base_counts.hpp:79
genesis::population::SimplePileupReader::Sample
One sample in a pileup line/record.
Definition: simple_pileup_reader.hpp:102
genesis::population::SimplePileupReader::Record
Single line/record from a pileup file.
Definition: simple_pileup_reader.hpp:151
genesis::population::SimplePileupReader::Record::reference_base
char reference_base
Definition: simple_pileup_reader.hpp:155
genesis::population::convert_to_base_counts
BaseCounts convert_to_base_counts(SimplePileupReader::Sample const &sample, unsigned char min_phred_score)
Definition: simple_pileup_common.cpp:45
genesis::population::Variant::samples
std::vector< BaseCounts > samples
Definition: variant.hpp:69
genesis::population::Variant
A single variant at a position in a chromosome, along with BaseCounts for a set of samples.
Definition: variant.hpp:62
genesis::population::BaseCounts::c_count
size_t c_count
Count of all C nucleotides that are present in the sample.
Definition: base_counts.hpp:64
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::SimplePileupReader::Sample::read_bases
std::string read_bases
All bases (expect for indels) of the reads that cover the given position.
Definition: simple_pileup_reader.hpp:125
genesis::population::BaseCounts
One set of nucleotide base counts, for example for a given sample that represents a pool of sequenced...
Definition: base_counts.hpp:54
genesis::population::SimplePileupReader::Record::chromosome
std::string chromosome
Definition: simple_pileup_reader.hpp:153
genesis::population::sorted_base_counts
SortedBaseCounts sorted_base_counts(BaseCounts const &sample)
Return the order of base counts (nucleotides), largest one first.
Definition: population/functions/functions.cpp:191
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
genesis::population::SimplePileupReader::Record::samples
std::vector< Sample > samples
Definition: simple_pileup_reader.hpp:156
genesis::population::Variant::alternative_base
char alternative_base
Definition: variant.hpp:67
genesis::population::SimplePileupReader::Sample::phred_scores
std::vector< unsigned char > phred_scores
Phread-scaled scores of the bases as given in read_bases.
Definition: simple_pileup_reader.hpp:134
functions.hpp
simple_pileup_common.hpp
genesis::population::Variant::chromosome
std::string chromosome
Definition: variant.hpp:64