A library for working with phylogenetic and population genetic data.
v0.32.0
simple_pileup_common.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
35 
36 #include <cassert>
37 #include <stdexcept>
38 
39 namespace genesis {
40 namespace population {
41 
42 // =================================================================================================
43 // Simple (m)pileup Reader Helper Functions
44 // =================================================================================================
45 
47  SimplePileupReader::Sample const& sample,
48  unsigned char min_phred_score
49 ) {
50  SampleCounts result;
51 
52  // Tally up the bases.
53  size_t total_count = 0;
54  size_t skip_count = 0;
55  size_t rna_count = 0;
56  for( size_t i = 0; i < sample.read_bases.size(); ++i ) {
57 
58  // Quality control if available. Skip bases that are below the threshold.
59  if( !sample.phred_scores.empty() && sample.phred_scores[i] < min_phred_score ) {
60  ++skip_count;
61  continue;
62  }
63 
64  ++total_count;
65  switch( sample.read_bases[i] ) {
66  case 'a':
67  case 'A': {
68  ++result.a_count;
69  break;
70  }
71  case 'c':
72  case 'C': {
73  ++result.c_count;
74  break;
75  }
76  case 'g':
77  case 'G': {
78  ++result.g_count;
79  break;
80  }
81  case 't':
82  case 'T': {
83  ++result.t_count;
84  break;
85  }
86  case 'n':
87  case 'N': {
88  ++result.n_count;
89  break;
90  }
91  case '*':
92  case '#': {
93  ++result.d_count;
94  break;
95  }
96  case '<':
97  case '>': {
98  // Skipping RNA symbols. But count them, for sanity check.
99  (void) rna_count;
100  ++rna_count;
101  break;
102  }
103  default: {
104  throw std::runtime_error(
105  "Malformed pileup sample: Invalid allele character " +
106  utils::char_to_hex( sample.read_bases[i] )
107  );
108  }
109  }
110  }
111 
112  // Sanity checks and assertions.
113  (void) total_count;
114  assert(
115  total_count ==
116  result.a_count + result.c_count + result.g_count + result.t_count +
117  result.n_count + result.d_count + rna_count
118  );
119  assert( skip_count + total_count == sample.read_bases.size() );
120 
121  // Sum sanity checks. There seems to be a very weird special case (found in the PoPoolation2
122  // test dataset) where a line contains a deletion with a low phred score (`*`) that is not
123  // counted in the "Number of reads covering this position" counter:
124  // ` 89795 2R 113608 N 1 T$ A 0 * *`
125  // We account for this here by allowing exactly one such base that is either a deletion
126  // or a skip due to low phred score. There is no information that we know of about how
127  // "empty" lines should be treated in pileip, so we have to guess, and that here seems to work.
128  auto const count_sum =
129  result.a_count + result.c_count + result.g_count + result.t_count + result.n_count
130  ;
131  if(
132  sample.read_bases.size() != sample.read_depth &&
133  !( count_sum == 0 && result.d_count + skip_count == 1 )
134  ) {
135 
136  throw std::runtime_error(
137  "Malformed pileup sample: Given read depth (" + std::to_string( sample.read_depth ) +
138  ") does not match the number of bases found in the sample (" +
139  std::to_string( sample.read_bases.size() ) + ")"
140  );
141  }
142 
143  return result;
144 }
145 
147  SimplePileupReader::Record const& record,
148  unsigned char min_phred_score
149 ) {
150  // Set basic data
151  Variant result;
152  result.chromosome = record.chromosome;
153  result.position = record.position;
154  result.reference_base = utils::to_upper( record.reference_base );
155 
156  // Convert the individual samples
157  result.samples.reserve( record.samples.size() );
158  for( size_t i = 0; i < record.samples.size(); ++i ) {
159  result.samples.push_back( convert_to_sample_counts( record.samples[i], min_phred_score ));
160  }
161 
162  // Pileup does not contain ALT bases, so infer them from counts,
163  // using the base with the most counts that is not the reference base.
164  // We only do this if we have a reference base though, as otherwise, the sorting and alternative
165  // is meaningless anyway. Only need to check upper case here, as we converted above.
166  // Also, we do not set the alt base if it does not have any counts, and in that case is also
167  // meaningless to have an alt base.
168  if( is_valid_base( result.reference_base )) {
169  auto const sorted = sorted_sample_counts( result, true, SampleCountsFilterPolicy::kAll );
170  if( sorted[1].count > 0 ) {
171  result.alternative_base = utils::to_upper( sorted[1].base );
172  }
173  }
174 
175  return result;
176 }
177 
179  std::shared_ptr< utils::BaseInputSource > source,
180  size_t max_lines
181 ) {
182  // Make a reader that uses quality scores.
183  SimplePileupReader reader;
184  reader.with_quality_string( true );
185 
186  // Start an iterator over the input.
187  size_t line_cnt = 0;
189  while( iter ) {
190  ++iter;
191  ++line_cnt;
192  if( max_lines > 0 && line_cnt >= max_lines ) {
193  break;
194  }
195  }
196 
197  // Now get the accumulated counts of all quality codes, and guess the encoding.
198  return genesis::sequence::guess_quality_encoding( iter.reader().quality_code_counts() );
199 }
200 
201 } // namespace population
202 } // namespace genesis
functions.hpp
genesis::population::convert_to_variant
Variant convert_to_variant(SimplePileupReader::Record const &record, unsigned char min_phred_score)
Definition: simple_pileup_common.cpp:146
genesis::population::SampleCounts
One set of nucleotide sample counts, for example for a given sample that represents a pool of sequenc...
Definition: sample_counts.hpp:56
genesis::population::SimplePileupReader
Reader for line-by-line assessment of (m)pileup files.
Definition: simple_pileup_reader.hpp:78
genesis::population::SimplePileupReader::Record::position
size_t position
Definition: simple_pileup_reader.hpp:155
genesis::population::SampleCounts::a_count
size_type a_count
Count of all A nucleotides that are present in the sample.
Definition: sample_counts.hpp:66
genesis::population::Variant::position
size_t position
Definition: variant.hpp:74
genesis::population::SimplePileupReader::Sample::read_depth
size_t read_depth
Total count of reads covering this position.
Definition: simple_pileup_reader.hpp:117
genesis::population::Variant::reference_base
char reference_base
Definition: variant.hpp:79
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
genesis::population::SampleCounts::t_count
size_type t_count
Count of all T nucleotides that are present in the sample.
Definition: sample_counts.hpp:81
simple_pileup_input_stream.hpp
genesis::population::SampleCountsFilterPolicy::kAll
@ kAll
genesis::population::SimplePileupInputStream
Iterate an input source and parse it as a (m)pileup file.
Definition: simple_pileup_input_stream.hpp:79
genesis::population::SimplePileupReader::Sample
One sample in a pileup line/record.
Definition: simple_pileup_reader.hpp:103
genesis::population::SampleCounts::n_count
size_type n_count
Count of all N (undetermined/any) nucleotides that are present in the sample.
Definition: sample_counts.hpp:86
genesis::population::convert_to_sample_counts
SampleCounts convert_to_sample_counts(SimplePileupReader::Sample const &sample, unsigned char min_phred_score)
Definition: simple_pileup_common.cpp:46
genesis::population::SimplePileupReader::Record
Single line/record from a pileup file.
Definition: simple_pileup_reader.hpp:152
genesis::population::SimplePileupReader::Record::reference_base
char reference_base
Definition: simple_pileup_reader.hpp:156
genesis::sequence::QualityEncoding
QualityEncoding
List of quality encodings for which we support decoding.
Definition: quality.hpp:72
genesis::population::SampleCounts::c_count
size_type c_count
Count of all C nucleotides that are present in the sample.
Definition: sample_counts.hpp:71
genesis::population::is_valid_base
constexpr bool is_valid_base(char c)
Return whether a given base is in ACGT, case insensitive.
Definition: population/function/functions.hpp:56
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::guess_quality_encoding
QualityEncoding guess_quality_encoding(std::array< size_t, 128 > const &char_counts)
Guess the quality score encoding, based on counts of how often each char appeared in the quality stri...
Definition: quality.cpp:433
genesis::population::SimplePileupReader::Sample::read_bases
std::string read_bases
All bases (expect for indels) of the reads that cover the given position.
Definition: simple_pileup_reader.hpp:126
genesis::population::SimplePileupReader::Record::chromosome
std::string chromosome
Definition: simple_pileup_reader.hpp:154
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
genesis::population::Variant::samples
std::vector< SampleCounts > samples
Definition: variant.hpp:82
genesis::population::SimplePileupReader::Record::samples
std::vector< Sample > samples
Definition: simple_pileup_reader.hpp:157
genesis::population::Variant::alternative_base
char alternative_base
Definition: variant.hpp:80
genesis::population::guess_pileup_quality_encoding
genesis::sequence::QualityEncoding guess_pileup_quality_encoding(std::shared_ptr< utils::BaseInputSource > source, size_t max_lines)
Guess the quality score encoding for an (m)pileup input, based on counts of how often each char appea...
Definition: simple_pileup_common.cpp:178
genesis::population::SampleCounts::d_count
size_type d_count
Count of all deleted (*) nucleotides that are present in the sample.
Definition: sample_counts.hpp:91
genesis::population::sorted_sample_counts
SortedSampleCounts sorted_sample_counts(SampleCounts const &sample)
Return the order of base counts (nucleotides), largest one first.
Definition: population/function/functions.cpp:134
genesis::population::SimplePileupReader::Sample::phred_scores
std::vector< unsigned char > phred_scores
Phread-scaled scores of the bases as given in read_bases.
Definition: simple_pileup_reader.hpp:135
simple_pileup_common.hpp
genesis::population::SimplePileupReader::with_quality_string
bool with_quality_string() const
Definition: simple_pileup_reader.hpp:313
genesis::population::Variant::chromosome
std::string chromosome
Definition: variant.hpp:73
genesis::population::SampleCounts::g_count
size_type g_count
Count of all G nucleotides that are present in the sample.
Definition: sample_counts.hpp:76