A library for working with phylogenetic and population genetic data.
v0.32.0
simple_pileup_reader.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FORMAT_SIMPLE_PILEUP_READER_H_
2 #define GENESIS_POPULATION_FORMAT_SIMPLE_PILEUP_READER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
38 
39 #include <array>
40 #include <string>
41 #include <vector>
42 
43 namespace genesis {
44 namespace population {
45 
46 // =================================================================================================
47 // Simple (m)pileup Reader
48 // =================================================================================================
49 
79 {
80 public:
81 
82  // -------------------------------------------------------------------------
83  // Typedefs and Enums
84  // -------------------------------------------------------------------------
85 
103  struct Sample
104  {
117  size_t read_depth = 0;
118 
126  std::string read_bases;
127 
135  std::vector<unsigned char> phred_scores;
136 
142  char ancestral_base = '\0';
143  };
144 
152  struct Record
153  {
154  std::string chromosome = "";
155  size_t position = 0;
156  char reference_base = 'N';
157  std::vector<Sample> samples;
158  };
159 
161 
162  // -------------------------------------------------------------------------
163  // Constructors and Rule of Five
164  // -------------------------------------------------------------------------
165 
166  SimplePileupReader() = default;
167  ~SimplePileupReader() = default;
168 
169  SimplePileupReader( self_type const& ) = default;
170  SimplePileupReader( self_type&& ) = default;
171 
172  self_type& operator= ( self_type const& ) = default;
173  self_type& operator= ( self_type&& ) = default;
174 
175  // ---------------------------------------------------------------------
176  // Reading Records
177  // ---------------------------------------------------------------------
178 
182  std::vector<Record> read_records( std::shared_ptr< utils::BaseInputSource > source ) const;
183 
184  // /* *
185  // * @brief Read an (m)pileup file line by line, but only the samples at the given indices,
186  // * as pileup Record%s.
187  // */
188  // std::vector<Record> read_records(
189  // std::shared_ptr< utils::BaseInputSource > source,
190  // std::vector<size_t> const& sample_indices
191  // ) const;
192 
199  std::vector<Record> read_records(
200  std::shared_ptr< utils::BaseInputSource > source,
201  std::vector<bool> const& sample_filter
202  ) const;
203 
204  // ---------------------------------------------------------------------
205  // Reading Variants
206  // ---------------------------------------------------------------------
207 
211  std::vector<Variant> read_variants( std::shared_ptr< utils::BaseInputSource > source ) const;
212 
213  // /* *
214  // * @brief Read an (m)pileup file line by line, but only the samples at the given indices,
215  // * as Variant%s.
216  // */
217  // std::vector<Variant> read_variants(
218  // std::shared_ptr< utils::BaseInputSource > source,
219  // std::vector<size_t> const& sample_indices
220  // ) const;
221 
228  std::vector<Variant> read_variants(
229  std::shared_ptr< utils::BaseInputSource > source,
230  std::vector<bool> const& sample_filter
231  ) const;
232 
233  // -------------------------------------------------------------------------
234  // Parsing Records
235  // -------------------------------------------------------------------------
236 
245  bool parse_line_record(
246  utils::InputStream& input_stream,
247  Record& record
248  ) const;
249 
258  bool parse_line_record(
259  utils::InputStream& input_stream,
260  Record& record,
261  std::vector<bool> const& sample_filter
262  ) const;
263 
264  // -------------------------------------------------------------------------
265  // Parsing Variants
266  // -------------------------------------------------------------------------
267 
273  bool parse_line_variant(
274  utils::InputStream& input_stream,
275  Variant& variant
276  ) const;
277 
286  bool parse_line_variant(
287  utils::InputStream& input_stream,
288  Variant& variant,
289  std::vector<bool> const& sample_filter
290  ) const;
291 
292  // -------------------------------------------------------------------------
293  // General Settings
294  // -------------------------------------------------------------------------
295 
296  bool strict_bases() const
297  {
298  return strict_bases_;
299  }
300 
307  self_type& strict_bases( bool value )
308  {
309  strict_bases_ = value;
310  return *this;
311  }
312 
313  bool with_quality_string() const
314  {
315  return with_quality_string_;
316  }
317 
334  {
335  with_quality_string_ = value;
336  return *this;
337  }
338 
340  {
341  return quality_encoding_;
342  }
343 
352  {
353  quality_encoding_ = value;
354  return *this;
355  }
356 
367  std::array<size_t, 128> const& quality_code_counts() const
368  {
369  return quality_code_counts_;
370  }
371 
372  bool with_ancestral_base() const
373  {
374  return with_ancestral_base_;
375  }
376 
393  {
394  with_ancestral_base_ = value;
395  return *this;
396  }
397 
398  // -------------------------------------------------------------------------
399  // Variant Settings
400  // -------------------------------------------------------------------------
401 
408  size_t min_base_quality() const
409  {
410  return min_base_quality_;
411  }
412 
423  self_type& min_base_quality( size_t value )
424  {
425  min_base_quality_ = value;
426  return *this;
427  }
428 
429  // -------------------------------------------------------------------------
430  // Internal Members
431  // -------------------------------------------------------------------------
432 
433 private:
434 
438  void reset_status_( Variant& variant ) const;
439 
444  template<class T>
445  bool parse_line_(
446  utils::InputStream& input_stream,
447  T& target,
448  std::vector<bool> const& sample_filter,
449  bool use_sample_filter
450  ) const;
451 
456  template<class T, class S>
457  void process_sample_(
458  utils::InputStream& input_stream,
459  T const& target,
460  S& sample
461  ) const;
462 
463  template<class T>
464  void set_target_alternative_base_(
465  T& target
466  ) const;
467 
468  template<class S>
469  void set_sample_read_depth_(
470  size_t read_depth,
471  S& sample
472  ) const;
473 
474  bool process_sample_read_bases_buffer_(
475  utils::InputStream& input_stream,
476  char reference_base
477  ) const;
478 
479  void process_sample_read_bases_stream_(
480  utils::InputStream& input_stream,
481  char reference_base
482  ) const;
483 
484  template<class S>
485  void set_sample_read_bases_(
486  std::string const& read_bases,
487  S& sample
488  ) const;
489 
490  template<class S>
491  void process_quality_string_(
492  utils::InputStream& input_stream,
493  S& sample
494  ) const;
495 
496  void tally_base_(
497  utils::InputStream& input_stream,
498  SampleCounts& sample,
499  char b
500  ) const;
501 
502  template<class S>
503  void process_ancestral_base_(
504  utils::InputStream& input_stream,
505  S& sample
506  ) const;
507 
508  void skip_sample_(
509  utils::InputStream& input_stream
510  ) const;
511 
512  void next_field_(
513  utils::InputStream& input_stream
514  ) const;
515 
516  // -------------------------------------------------------------------------
517  // Data Members
518  // -------------------------------------------------------------------------
519 
520 private:
521 
522  // If set, we expect bases to be ACGTN. If not set, we will fix any that are not to N.
523  bool strict_bases_ = false;
524 
525  // Set whether the file contains the base quality score column, and if so, how its encoded
526  // (we default to Sanger with offset 33), and if we want to skip low quality bases.
527  bool with_quality_string_ = true;
529  size_t min_base_quality_ = 0;
530 
531  // We also keep track of the base codes found, to check that we have the right encoding.
532  mutable std::array<size_t, 128> quality_code_counts_;
533 
534  // Set whether the last part of the sample line contains the base of the ancestral allele.
535  bool with_ancestral_base_ = false;
536 
537  // Internal buffer to read bases into. Used for speedup to avoid reallocations.
538  mutable std::string base_buffer_;
539 };
540 
541 } // namespace population
542 } // namespace genesis
543 
544 #endif // include guard
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
genesis::population::SimplePileupReader::parse_line_variant
bool parse_line_variant(utils::InputStream &input_stream, Variant &variant) const
Read an (m)pileup line, as a Variant.
Definition: simple_pileup_reader.cpp:181
genesis::population::SampleCounts
One set of nucleotide sample counts, for example for a given sample that represents a pool of sequenc...
Definition: sample_counts.hpp:56
genesis::population::SimplePileupReader
Reader for line-by-line assessment of (m)pileup files.
Definition: simple_pileup_reader.hpp:78
genesis::population::SimplePileupReader::Record::position
size_t position
Definition: simple_pileup_reader.hpp:155
genesis::population::SimplePileupReader::min_base_quality
self_type & min_base_quality(size_t value)
Set the minimum phred quality score that a base needs to have to be added to the Variant SampleCounts...
Definition: simple_pileup_reader.hpp:423
genesis::population::SimplePileupReader::Sample::read_depth
size_t read_depth
Total count of reads covering this position.
Definition: simple_pileup_reader.hpp:117
genesis::population::SimplePileupReader::read_variants
std::vector< Variant > read_variants(std::shared_ptr< utils::BaseInputSource > source) const
Read an (m)pileup file line by line, as Variants.
Definition: simple_pileup_reader.cpp:110
genesis::sequence::QualityEncoding::kSanger
@ kSanger
genesis::population::SimplePileupReader::quality_encoding
self_type & quality_encoding(sequence::QualityEncoding value)
Set the type of encoding for the quality code string.
Definition: simple_pileup_reader.hpp:351
input_source.hpp
input_stream.hpp
genesis::population::SimplePileupReader::Sample
One sample in a pileup line/record.
Definition: simple_pileup_reader.hpp:103
genesis::population::SimplePileupReader::~SimplePileupReader
~SimplePileupReader()=default
genesis::population::SimplePileupReader::Record
Single line/record from a pileup file.
Definition: simple_pileup_reader.hpp:152
genesis::population::SimplePileupReader::Record::reference_base
char reference_base
Definition: simple_pileup_reader.hpp:156
genesis::sequence::QualityEncoding
QualityEncoding
List of quality encodings for which we support decoding.
Definition: quality.hpp:72
genesis::population::SimplePileupReader::min_base_quality
size_t min_base_quality() const
Get the currently set minimum phred quality score that a base needs to have to be added to the Varian...
Definition: simple_pileup_reader.hpp:408
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::SimplePileupReader::Sample::read_bases
std::string read_bases
All bases (expect for indels) of the reads that cover the given position.
Definition: simple_pileup_reader.hpp:126
genesis::population::SimplePileupReader::strict_bases
bool strict_bases() const
Definition: simple_pileup_reader.hpp:296
genesis::population::SimplePileupReader::with_quality_string
self_type & with_quality_string(bool value)
Set whether to expect a phred-scaled, ASCII-encoded quality code string per sample.
Definition: simple_pileup_reader.hpp:333
genesis::population::SimplePileupReader::Record::chromosome
std::string chromosome
Definition: simple_pileup_reader.hpp:154
genesis::population::SimplePileupReader::quality_code_counts
std::array< size_t, 128 > const & quality_code_counts() const
Return the counts for all quality base codes found so far when parsing an input.
Definition: simple_pileup_reader.hpp:367
genesis::population::SimplePileupReader::with_ancestral_base
self_type & with_ancestral_base(bool value)
Set whether to expect the base of the ancestral allele as the last part of each sample in a record li...
Definition: simple_pileup_reader.hpp:392
genesis::population::SimplePileupReader::strict_bases
self_type & strict_bases(bool value)
Set whether to strictly require bases to be in ACGTN.
Definition: simple_pileup_reader.hpp:307
genesis::population::SimplePileupReader::parse_line_record
bool parse_line_record(utils::InputStream &input_stream, Record &record) const
Read an (m)pileup line, as a Record.
Definition: simple_pileup_reader.cpp:162
genesis::population::SimplePileupReader::Sample::ancestral_base
char ancestral_base
Base of the ancestral allele.
Definition: simple_pileup_reader.hpp:142
genesis::population::SimplePileupReader::operator=
self_type & operator=(self_type const &)=default
genesis::population::SimplePileupReader::Record::samples
std::vector< Sample > samples
Definition: simple_pileup_reader.hpp:157
genesis::population::SimplePileupReader::with_ancestral_base
bool with_ancestral_base() const
Definition: simple_pileup_reader.hpp:372
variant.hpp
genesis::population::SimplePileupReader::SimplePileupReader
SimplePileupReader()=default
genesis::population::SimplePileupReader::Sample::phred_scores
std::vector< unsigned char > phred_scores
Phread-scaled scores of the bases as given in read_bases.
Definition: simple_pileup_reader.hpp:135
genesis::population::SimplePileupReader::quality_encoding
sequence::QualityEncoding quality_encoding() const
Definition: simple_pileup_reader.hpp:339
genesis::population::SimplePileupReader::read_records
std::vector< Record > read_records(std::shared_ptr< utils::BaseInputSource > source) const
Read an (m)pileup file line by line, as pileup Records.
Definition: simple_pileup_reader.cpp:52
quality.hpp
genesis::population::SimplePileupReader::with_quality_string
bool with_quality_string() const
Definition: simple_pileup_reader.hpp:313