A library for working with phylogenetic and population genetic data.
v0.27.0
simple_pileup_reader.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FORMATS_SIMPLE_PILEUP_READER_H_
2 #define GENESIS_POPULATION_FORMATS_SIMPLE_PILEUP_READER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2022 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
38 
39 #include <string>
40 #include <vector>
41 
42 namespace genesis {
43 namespace population {
44 
45 // =================================================================================================
46 // Simple (m)pileup Reader
47 // =================================================================================================
48 
78 {
79 public:
80 
81  // -------------------------------------------------------------------------
82  // Typedefs and Enums
83  // -------------------------------------------------------------------------
84 
102  struct Sample
103  {
116  size_t read_coverage = 0;
117 
125  std::string read_bases;
126 
134  std::vector<unsigned char> phred_scores;
135 
141  char ancestral_base = '\0';
142  };
143 
151  struct Record
152  {
153  std::string chromosome = "";
154  size_t position = 0;
155  char reference_base = 'N';
156  std::vector<Sample> samples;
157  };
158 
160 
161  // -------------------------------------------------------------------------
162  // Constructors and Rule of Five
163  // -------------------------------------------------------------------------
164 
165  SimplePileupReader() = default;
166  ~SimplePileupReader() = default;
167 
168  SimplePileupReader( self_type const& ) = default;
169  SimplePileupReader( self_type&& ) = default;
170 
171  self_type& operator= ( self_type const& ) = default;
172  self_type& operator= ( self_type&& ) = default;
173 
174  // ---------------------------------------------------------------------
175  // Reading Records
176  // ---------------------------------------------------------------------
177 
181  std::vector<Record> read_records( std::shared_ptr< utils::BaseInputSource > source ) const;
182 
183  // /* *
184  // * @brief Read an (m)pileup file line by line, but only the samples at the given indices,
185  // * as pileup Record%s.
186  // */
187  // std::vector<Record> read_records(
188  // std::shared_ptr< utils::BaseInputSource > source,
189  // std::vector<size_t> const& sample_indices
190  // ) const;
191 
198  std::vector<Record> read_records(
199  std::shared_ptr< utils::BaseInputSource > source,
200  std::vector<bool> const& sample_filter
201  ) const;
202 
203  // ---------------------------------------------------------------------
204  // Reading Variants
205  // ---------------------------------------------------------------------
206 
210  std::vector<Variant> read_variants( std::shared_ptr< utils::BaseInputSource > source ) const;
211 
212  // /* *
213  // * @brief Read an (m)pileup file line by line, but only the samples at the given indices,
214  // * as Variant%s.
215  // */
216  // std::vector<Variant> read_variants(
217  // std::shared_ptr< utils::BaseInputSource > source,
218  // std::vector<size_t> const& sample_indices
219  // ) const;
220 
227  std::vector<Variant> read_variants(
228  std::shared_ptr< utils::BaseInputSource > source,
229  std::vector<bool> const& sample_filter
230  ) const;
231 
232  // -------------------------------------------------------------------------
233  // Parsing Records
234  // -------------------------------------------------------------------------
235 
244  bool parse_line_record(
245  utils::InputStream& input_stream,
246  Record& record
247  ) const;
248 
257  bool parse_line_record(
258  utils::InputStream& input_stream,
259  Record& record,
260  std::vector<bool> const& sample_filter
261  ) const;
262 
263  // -------------------------------------------------------------------------
264  // Parsing Variants
265  // -------------------------------------------------------------------------
266 
272  bool parse_line_variant(
273  utils::InputStream& input_stream,
274  Variant& variant
275  ) const;
276 
285  bool parse_line_variant(
286  utils::InputStream& input_stream,
287  Variant& variant,
288  std::vector<bool> const& sample_filter
289  ) const;
290 
291  // -------------------------------------------------------------------------
292  // General Settings
293  // -------------------------------------------------------------------------
294 
295  bool strict_bases() const
296  {
297  return strict_bases_;
298  }
299 
306  self_type& strict_bases( bool value )
307  {
308  strict_bases_ = value;
309  return *this;
310  }
311 
312  bool with_quality_string() const
313  {
314  return with_quality_string_;
315  }
316 
333  {
334  with_quality_string_ = value;
335  return *this;
336  }
337 
339  {
340  return quality_encoding_;
341  }
342 
351  {
352  quality_encoding_ = value;
353  return *this;
354  }
355 
356  bool with_ancestral_base() const
357  {
358  return with_ancestral_base_;
359  }
360 
377  {
378  with_ancestral_base_ = value;
379  return *this;
380  }
381 
382  // -------------------------------------------------------------------------
383  // Variant Settings
384  // -------------------------------------------------------------------------
385 
392  size_t min_base_quality() const
393  {
394  return min_base_quality_;
395  }
396 
407  self_type& min_base_quality( size_t value )
408  {
409  min_base_quality_ = value;
410  return *this;
411  }
412 
413  // -------------------------------------------------------------------------
414  // Internal Members
415  // -------------------------------------------------------------------------
416 
417 private:
418 
423  template<class T>
424  bool parse_line_(
425  utils::InputStream& input_stream,
426  T& target,
427  std::vector<bool> const& sample_filter,
428  bool use_sample_filter
429  ) const;
430 
435  template<class T, class S>
436  void process_sample_(
437  utils::InputStream& input_stream,
438  T const& target,
439  S& sample
440  ) const;
441 
442  template<class T>
443  void set_target_alternative_base_(
444  T& target
445  ) const;
446 
447  template<class S>
448  void set_sample_read_coverage_(
449  size_t read_coverage,
450  S& sample
451  ) const;
452 
453  template<class S>
454  void set_sample_read_bases_(
455  std::string const& read_bases,
456  S& sample
457  ) const;
458 
459  template<class S>
460  void process_quality_string_(
461  utils::InputStream& input_stream,
462  S& sample
463  ) const;
464 
465  void tally_base_(
466  utils::InputStream& input_stream,
467  BaseCounts& base_count,
468  char b
469  ) const;
470 
471  template<class S>
472  void process_ancestral_base_(
473  utils::InputStream& input_stream,
474  S& sample
475  ) const;
476 
477  void skip_sample_(
478  utils::InputStream& input_stream
479  ) const;
480 
481  void next_field_(
482  utils::InputStream& input_stream
483  ) const;
484 
485  // -------------------------------------------------------------------------
486  // Data Members
487  // -------------------------------------------------------------------------
488 
489 private:
490 
491  // If set, we expect bases to be ACGTN. If not set, we will fix any that are not to N.
492  bool strict_bases_ = false;
493 
494  // Set whether the file contains the base quality score column, and if so, how its encoded
495  // (we default to Sanger with offset 33), and if we want to skip low quality bases.
496  bool with_quality_string_ = true;
498  size_t min_base_quality_ = 0;
499 
500  // Set whether the last part of the sample line contains the base of the ancestral allele.
501  bool with_ancestral_base_ = false;
502 
503  // Internal buffer to read bases into. Used for speedup to avoid reallocations.
504  mutable std::string base_buffer_;
505 };
506 
507 } // namespace population
508 } // namespace genesis
509 
510 #endif // include guard
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:81
genesis::population::SimplePileupReader::parse_line_variant
bool parse_line_variant(utils::InputStream &input_stream, Variant &variant) const
Read an (m)pileup line, as a Variant.
Definition: simple_pileup_reader.cpp:208
genesis::population::SimplePileupReader
Reader for line-by-line assessment of (m)pileup files.
Definition: simple_pileup_reader.hpp:77
genesis::population::SimplePileupReader::Record::position
size_t position
Definition: simple_pileup_reader.hpp:154
genesis::population::SimplePileupReader::min_base_quality
self_type & min_base_quality(size_t value)
Set the minimum phred quality score that a base needs to have to be added to the Variant BaseCounts f...
Definition: simple_pileup_reader.hpp:407
genesis::population::SimplePileupReader::read_variants
std::vector< Variant > read_variants(std::shared_ptr< utils::BaseInputSource > source) const
Read an (m)pileup file line by line, as Variants.
Definition: simple_pileup_reader.cpp:129
genesis::sequence::QualityEncoding::kSanger
@ kSanger
genesis::population::SimplePileupReader::Sample::read_coverage
size_t read_coverage
Total count of reads covering this position.
Definition: simple_pileup_reader.hpp:116
genesis::population::SimplePileupReader::quality_encoding
self_type & quality_encoding(sequence::QualityEncoding value)
Set the type of encoding for the quality code string.
Definition: simple_pileup_reader.hpp:350
input_source.hpp
input_stream.hpp
genesis::population::SimplePileupReader::Sample
One sample in a pileup line/record.
Definition: simple_pileup_reader.hpp:102
genesis::population::SimplePileupReader::~SimplePileupReader
~SimplePileupReader()=default
genesis::population::SimplePileupReader::Record
Single line/record from a pileup file.
Definition: simple_pileup_reader.hpp:151
genesis::population::SimplePileupReader::Record::reference_base
char reference_base
Definition: simple_pileup_reader.hpp:155
genesis::sequence::QualityEncoding
QualityEncoding
List of quality encodings for which we support decoding.
Definition: quality.hpp:72
genesis::population::SimplePileupReader::min_base_quality
size_t min_base_quality() const
Get the currently set minimum phred quality score that a base needs to have to be added to the Varian...
Definition: simple_pileup_reader.hpp:392
genesis::population::Variant
A single variant at a position in a chromosome, along with BaseCounts for a set of samples.
Definition: variant.hpp:62
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::SimplePileupReader::Sample::read_bases
std::string read_bases
All bases (expect for indels) of the reads that cover the given position.
Definition: simple_pileup_reader.hpp:125
genesis::population::SimplePileupReader::strict_bases
bool strict_bases() const
Definition: simple_pileup_reader.hpp:295
genesis::population::BaseCounts
One set of nucleotide base counts, for example for a given sample that represents a pool of sequenced...
Definition: base_counts.hpp:54
genesis::population::SimplePileupReader::with_quality_string
self_type & with_quality_string(bool value)
Set whether to expect a phred-scaled, ASCII-encoded quality code string per sample.
Definition: simple_pileup_reader.hpp:332
genesis::population::SimplePileupReader::Record::chromosome
std::string chromosome
Definition: simple_pileup_reader.hpp:153
genesis::population::SimplePileupReader::with_ancestral_base
self_type & with_ancestral_base(bool value)
Set whether to expect the base of the ancestral allele as the last part of each sample in a record li...
Definition: simple_pileup_reader.hpp:376
genesis::population::SimplePileupReader::strict_bases
self_type & strict_bases(bool value)
Set whether to strictly require bases to be in ACGTN.
Definition: simple_pileup_reader.hpp:306
genesis::population::SimplePileupReader::parse_line_record
bool parse_line_record(utils::InputStream &input_stream, Record &record) const
Read an (m)pileup line, as a Record.
Definition: simple_pileup_reader.cpp:189
genesis::population::SimplePileupReader::Sample::ancestral_base
char ancestral_base
Base of the ancestral allele.
Definition: simple_pileup_reader.hpp:141
genesis::population::SimplePileupReader::operator=
self_type & operator=(self_type const &)=default
genesis::population::SimplePileupReader::Record::samples
std::vector< Sample > samples
Definition: simple_pileup_reader.hpp:156
genesis::population::SimplePileupReader::with_ancestral_base
bool with_ancestral_base() const
Definition: simple_pileup_reader.hpp:356
variant.hpp
genesis::population::SimplePileupReader::SimplePileupReader
SimplePileupReader()=default
genesis::population::SimplePileupReader::Sample::phred_scores
std::vector< unsigned char > phred_scores
Phread-scaled scores of the bases as given in read_bases.
Definition: simple_pileup_reader.hpp:134
genesis::population::SimplePileupReader::quality_encoding
sequence::QualityEncoding quality_encoding() const
Definition: simple_pileup_reader.hpp:338
genesis::population::SimplePileupReader::read_records
std::vector< Record > read_records(std::shared_ptr< utils::BaseInputSource > source) const
Read an (m)pileup file line by line, as pileup Records.
Definition: simple_pileup_reader.cpp:69
quality.hpp
genesis::population::SimplePileupReader::with_quality_string
bool with_quality_string() const
Definition: simple_pileup_reader.hpp:312