A library for working with phylogenetic data.
v0.25.0
simple_pileup_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2021 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
37 
38 #include <algorithm>
39 #include <cassert>
40 #include <cstdlib>
41 #include <cstring>
42 #include <stdexcept>
43 
44 namespace genesis {
45 namespace population {
46 
47 // =================================================================================================
48 // Reading & Parsing
49 // =================================================================================================
50 
51 std::vector<SimplePileupReader::Record> SimplePileupReader::read(
52  std::shared_ptr< utils::BaseInputSource > source
53 ) const {
54  std::vector<SimplePileupReader::Record> result;
55  utils::InputStream it( source );
56 
57  Record rec;
58  while( parse_line_( it, rec, {}, false )) {
59  result.push_back( rec );
60  }
61  return result;
62 }
63 
64 std::vector<SimplePileupReader::Record> SimplePileupReader::read(
65  std::shared_ptr< utils::BaseInputSource > source,
66  std::vector<size_t> const& sample_indices
67 ) const {
68  std::vector<SimplePileupReader::Record> result;
69  utils::InputStream it( source );
70 
71  // Convert the list of indices to a bool vec that tells which samples we want to process.
72  auto const sample_filter = make_sample_filter( sample_indices );
73 
74  Record rec;
75  while( parse_line_( it, rec, sample_filter, true )) {
76  result.push_back( rec );
77  }
78  return result;
79 }
80 
81 std::vector<SimplePileupReader::Record> SimplePileupReader::read(
82  std::shared_ptr< utils::BaseInputSource > source,
83  std::vector<bool> const& sample_filter
84 ) const {
85  std::vector<SimplePileupReader::Record> result;
86  utils::InputStream it( source );
87 
88  Record rec;
89  while( parse_line_( it, rec, sample_filter, true )) {
90  result.push_back( rec );
91  }
92  return result;
93 }
94 
96  utils::InputStream& input_stream,
98 ) const {
99  return parse_line_( input_stream, record, {}, false );
100 }
101 
103  utils::InputStream& input_stream,
105  std::vector<bool> const& sample_filter
106 ) const {
107  return parse_line_( input_stream, record, sample_filter, true );
108 }
109 
110 // =================================================================================================
111 // Helper Functions
112 // =================================================================================================
113 
114 std::vector<bool> SimplePileupReader::make_sample_filter( std::vector<size_t> const& indices )
115 {
116  // Get the largest element of the vector. If it's empty, we return an empty vector as well.
117  auto max_it = std::max_element( indices.begin(), indices.end() );
118  if( max_it == indices.end() ) {
119  return {};
120  }
121  size_t max_index = *max_it;
122 
123  // Fill a bool vector, setting all positions to true
124  // that are indicated by the indices, pun intended.
125  auto result = std::vector<bool>( max_index, false );
126  for( auto const& idx : indices ) {
127  assert( idx < result.size() );
128  result[idx] = true;
129  }
130  return result;
131 }
132 
133 // =================================================================================================
134 // Internal Members
135 // =================================================================================================
136 
137 // -------------------------------------------------------------------------
138 // Parse Line
139 // -------------------------------------------------------------------------
140 
141 bool SimplePileupReader::parse_line_(
142  utils::InputStream& input_stream,
143  Record& record,
144  std::vector<bool> const& sample_filter,
145  bool use_sample_filter
146 ) const {
147  // Shorthand.
148  auto& it = input_stream;
149 
150  // If we reached the end of the input stream, reset the record. We do not reset per default,
151  // in order to avoid costly re-initialization of the sample vector. But when we finish with
152  // an input stream, we want to reset, so that subsequent usage of this reader class does not
153  // fail if the pileip file contains a different number of samples.
154  // Still, the user will currently get an error when using the same reader instance to
155  // simultaneously (interlaced) read from multiple pileup files with differing number of samples
156  // into the same record... But who does that?! If you are a user having this issue, please
157  // let me know!
158  if( !it ) {
159  record = Record();
160  return false;
161  }
162  assert( it );
163  if( *it == '\n' ) {
164  throw std::runtime_error(
165  "Malformed pileup " + it.source_name() + " at " + it.at() + ": Invalid empty line"
166  );
167  }
168 
169  // Read chromosome.
171  record.chromosome = utils::read_while( it, utils::is_graph );
172  assert( !it || !utils::is_graph( *it ));
173 
174  // Read position.
175  next_field_( it );
176  record.position = utils::parse_unsigned_integer<size_t>( it );
177  assert( !it || !utils::is_digit( *it ));
178 
179  // Read reference base.
180  next_field_( it );
181  auto const rb = utils::to_upper( *it );
182  if( rb != 'A' && rb != 'C' && rb != 'G' && rb != 'T' && rb != 'N' ) {
183  throw std::runtime_error(
184  "Malformed pileup " + it.source_name() + " at " + it.at() +
185  ": Invalid reference base that is not in [ACGTN]"
186  );
187  }
188  record.reference_base = rb;
189  ++it;
190 
191  // Read the samples. We switch once for the first line, and thereafter check that we read the
192  // same number of samples each time.
193  if( record.samples.empty() ) {
194  size_t src_index = 0;
195  while( it && *it != '\n' ) {
196  if( !use_sample_filter || ( src_index < sample_filter.size() && sample_filter[src_index] )) {
197  record.samples.emplace_back();
198  process_sample_( it, record, record.samples.back() );
199  } else {
200  skip_sample_( it );
201  }
202  ++src_index;
203  }
204  } else {
205  // Here we need two indices, one over the samples in the file (source),
206  // and one for the samples that we are writing in our Record (destination).
207  size_t src_index = 0;
208  size_t dst_index = 0;
209  while( it && *it != '\n' ) {
210  if( dst_index >= record.samples.size() ) {
211  throw std::runtime_error(
212  "Malformed pileup " + it.source_name() + " at " + it.at() +
213  ": Line with different number of samples."
214  );
215  }
216  if( !use_sample_filter || ( src_index < sample_filter.size() && sample_filter[src_index] )) {
217  assert( dst_index < record.samples.size() );
218  process_sample_( it, record, record.samples[dst_index] );
219  ++dst_index;
220  } else {
221  skip_sample_( it );
222  }
223  ++src_index;
224  }
225  if( dst_index != record.samples.size() ) {
226  throw std::runtime_error(
227  "Malformed pileup " + it.source_name() + " at " + it.at() +
228  ": Line with different number of samples."
229  );
230  }
231  }
232 
233  assert( !it || *it == '\n' );
234  ++it;
235  return true;
236 }
237 
238 // -------------------------------------------------------------------------
239 // Process Sample
240 // -------------------------------------------------------------------------
241 
242 void SimplePileupReader::process_sample_(
243  utils::InputStream& input_stream,
244  Record& record,
245  Sample& sample
246 ) const {
247  // Shorthand.
248  auto& it = input_stream;
249 
250  // Reset the sample.
251  sample = Sample();
252 
253  // Read the total read count / coverage.
254  next_field_( it );
255  sample.read_coverage = utils::parse_unsigned_integer<size_t>( it );
256  assert( !it || !utils::is_digit( *it ));
257 
258  // Read the nucleotides, skipping everything that we don't want. We need to store these
259  // in a string first, as we want to do quality checks.
260  next_field_( it );
261  sample.read_bases.reserve( sample.read_coverage );
262  while( it && utils::is_graph( *it )) {
263  auto const c = *it;
264  switch( c ) {
265  case '+':
266  case '-': {
267  // A sequence matching `[+-][0-9]+[ACGTNacgtn]+` is an insertion or deletion.
268  // We skip/ignore those.
269 
270  // We first here allowed degenerated chars, and made a buffer for the codes
271  // that are allowed in indels - like this:
272  // static const std::string allowed_codes = sequence::nucleic_acid_codes_all_letters();
273 
274  // But later, we changed this to use the the proper pileup definition here,
275  // see http://www.htslib.org/doc/samtools-mpileup.html
276  static const std::string allowed_codes = "ACGTN*#";
277 
278  // First, we need to get how many chars there are in this indel.
279  ++it;
280  size_t const indel_cnt = utils::parse_unsigned_integer<size_t>( it );
281 
282  // Then, we skip that many chars, making sure that all is in order.
283  for( size_t i = 0; i < indel_cnt; ++i ) {
284  if( !it || !std::strchr( allowed_codes.c_str(), utils::to_upper( *it ))) {
285  throw std::runtime_error(
286  "Malformed pileup " + it.source_name() + " at " + it.at() +
287  ": Line with invalid indel character " + utils::char_to_hex( *it )
288  );
289  }
290  ++it;
291  }
292  break;
293  }
294  case '^': {
295  // Caret marks the start of a read segment, followed by a char for the mapping
296  // quality. We skip both of these.
297  ++it;
298  if( !it ) {
299  throw std::runtime_error(
300  "Malformed pileup " + it.source_name() + " at " + it.at() +
301  ": Line with invalid start of read segment marker"
302  );
303  }
304  ++it;
305  break;
306  }
307  case '$': {
308  // Dollar marks the end of a read segment. Skip.
309  ++it;
310  break;
311  }
312  case '.': {
313  sample.read_bases += utils::to_upper( record.reference_base );
314  ++it;
315  break;
316  }
317  case ',': {
318  sample.read_bases += utils::to_lower( record.reference_base );
319  ++it;
320  break;
321  }
322  default: {
323  sample.read_bases += c;
324  ++it;
325  break;
326  }
327  }
328  }
329  assert( !it || !utils::is_graph( *it ) );
330 
331  // Now read the quality codes, if present.
332  if( with_quality_string_ ) {
333  next_field_( it );
334  sample.phred_scores.reserve( sample.read_coverage );
335  while( it && utils::is_graph( *it )) {
336  sample.phred_scores.push_back( sequence::quality_decode_to_phred_score(
337  *it, quality_encoding_
338  ));
339  ++it;
340  }
341  assert( !it || !utils::is_graph( *it ) );
342 
343  // Version that processes the whole string at once. Not much time saved, as we need
344  // to allocate the string first. Maybe refine later for speed.
345  // std::string qual;
346  // qual.reserve( sample.read_coverage );
347  // while( it && utils::is_graph( *it )) {
348  // qual += *it;
349  // ++it;
350  // }
351  // sample.phred_scores = sequence::quality_decode_to_phred_score( qual, quality_encoding_ );
352 
353  if( sample.read_bases.size() != sample.phred_scores.size() ) {
354  throw std::runtime_error(
355  "Malformed pileup " + it.source_name() + " at " + it.at() +
356  ": Line contains " + std::to_string( sample.read_bases.size() ) + " bases, but " +
357  std::to_string( sample.phred_scores.size() ) + " quality score codes."
358  );
359  }
360  }
361  assert( sample.phred_scores.empty() || sample.read_bases.size() == sample.phred_scores.size() );
362  assert( !it || !utils::is_graph( *it ) );
363 
364  // Also check if we want to read the ancestral base, if present.
365  if( with_ancestral_base_ ) {
366  next_field_( it );
367  // We can simply read in the char here. Even if the iterator is at its end, it will
368  // simply return a null char, which will trigger the subsequent error check.
369  char const c = utils::to_upper( *it );
370  if( !it || ( c != 'A' && c != 'C' && c != 'G' && c != 'T' && c != 'N' )) {
371  throw std::runtime_error(
372  "Malformed pileup " + it.source_name() + " at " + it.at() +
373  ": Expecting ancestral base character in [ACGTN]."
374  );
375  }
376  sample.ancestral_base = c;
377  ++it;
378  }
379 
380  // Final file sanity checks.
381  if( it && !( utils::is_blank( *it ) || utils::is_newline( *it ))) {
382  throw std::runtime_error(
383  "Malformed pileup " + it.source_name() + " at " + it.at() +
384  ": Invalid characters."
385  );
386  }
387 }
388 
389 // -------------------------------------------------------------------------
390 // Skip Sample
391 // -------------------------------------------------------------------------
392 
393 void SimplePileupReader::skip_sample_(
394  utils::InputStream& input_stream
395 ) const {
396  // Shorthand.
397  auto& it = input_stream;
398 
399  // Read the total read count / coverage.
400  next_field_( it );
402  assert( !it || !utils::is_digit( *it ));
403 
404  // Read the nucleotides.
405  next_field_( it );
407  assert( !it || !utils::is_graph( *it ) );
408 
409  // Read the quality codes, if present.
410  if( with_quality_string_ ) {
411  next_field_( it );
413  }
414  assert( !it || !utils::is_graph( *it ) );
415 
416  // Read the ancestral base, if present.
417  if( with_ancestral_base_ ) {
418  next_field_( it );
420  }
421  assert( !it || !utils::is_graph( *it ) );
422 
423  // Final file sanity checks.
424  if( it && !( utils::is_blank( *it ) || utils::is_newline( *it ))) {
425  throw std::runtime_error(
426  "Malformed pileup " + it.source_name() + " at " + it.at() +
427  ": Invalid characters."
428  );
429  }
430 }
431 
432 // -------------------------------------------------------------------------
433 // Next Field
434 // -------------------------------------------------------------------------
435 
436 void SimplePileupReader::next_field_( utils::InputStream& input_stream ) const
437 {
438  // There needs to be at last some whitespace that separates the field. Affirm that,
439  // then skip it until we are at the content of the next field.
441  utils::skip_while( input_stream, utils::is_blank );
442  assert( !input_stream || !utils::is_blank( *input_stream ));
443 }
444 
445 } // namespace population
446 } // namespace genesis
genesis::sequence::quality_decode_to_phred_score
unsigned char quality_decode_to_phred_score(char quality_code, QualityEncoding encoding)
Decode a single quality score char (for example coming from a fastq file) to a phred score.
Definition: quality.cpp:193
genesis::utils::is_graph
constexpr bool is_graph(char c) noexcept
Return whether a char is a character with graphical representation, according to isgraph of the cctyp...
Definition: char.hpp:163
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:80
parser.hpp
genesis::utils::read_while
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
Definition: scanner.hpp:216
genesis::population::SimplePileupReader::read
std::vector< Record > read(std::shared_ptr< utils::BaseInputSource > source) const
Read an (m)pileup file line by line.
Definition: simple_pileup_reader.cpp:51
genesis::utils::affirm_char_or_throw
void affirm_char_or_throw(InputStream &source, char criterion, SkipWhitespace skip_ws=SkipWhitespace::kNone)
Lexing function that checks whether the current char from the stream equals the provided one.
Definition: scanner.hpp:385
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:227
genesis::population::SimplePileupReader::Record
Single line/record from a pileup file.
Definition: simple_pileup_reader.hpp:141
genesis::utils::is_newline
constexpr bool is_newline(char c) noexcept
Return whether a char is either a new line or a carriage return character.
Definition: char.hpp:179
genesis::utils::skip_while
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one.
Definition: scanner.hpp:153
simple_pileup_reader.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::is_digit
constexpr bool is_digit(char c) noexcept
Return whether a char is a digit (0-9), ASCII-only.
Definition: char.hpp:95
char.hpp
genesis::population::SimplePileupReader::parse_line
bool parse_line(utils::InputStream &input_stream, Record &record) const
Read an (m)pileup line.
Definition: simple_pileup_reader.cpp:95
genesis::population::SimplePileupReader::make_sample_filter
static std::vector< bool > make_sample_filter(std::vector< size_t > const &indices)
Helper function to create a sample filter from a list of sample indices.
Definition: simple_pileup_reader.cpp:114
genesis::population::to_string
std::string to_string(GenomeRegion const &region)
Definition: functions/genome_region.cpp:55
genesis::utils::is_blank
constexpr bool is_blank(char c) noexcept
Return whether a char is either a space or a tab character.
Definition: char.hpp:171
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:67
scanner.hpp
genesis::utils::to_lower
constexpr char to_lower(char c) noexcept
Return the lower case version of a letter, ASCII-only.
Definition: char.hpp:218
codes.hpp