A library for working with phylogenetic and population genetic data.
v0.27.0
fastq_reader.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_SEQUENCE_FORMATS_FASTQ_READER_H_
2 #define GENESIS_SEQUENCE_FORMATS_FASTQ_READER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2020 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
38 
39 #include <functional>
40 #include <iosfwd>
41 #include <memory>
42 #include <string>
43 
44 namespace genesis {
45 
46 // =================================================================================================
47 // Forward Declarations
48 // =================================================================================================
49 
50 namespace utils {
51  class InputStream;
52 }
53 
54 namespace sequence {
55  class SequenceSet;
56  class Sequence;
57 }
58 
59 // =================================================================================================
60 // Fastq Reader
61 // =================================================================================================
62 
63 namespace sequence {
64 
150 {
151 public:
152 
153  // ---------------------------------------------------------------------
154  // Typedefs and Enums
155  // ---------------------------------------------------------------------
156 
/// Function type that allows working with the quality line(s) in fastq files.
/// A callable of this type receives the quality string and a mutable reference to a Sequence,
/// and returns nothing.
170  using quality_string_function = std::function< void(
171  std::string const& quality_string, Sequence& sequence
172  ) >;
173 
/// Enumeration of casing methods to apply to each site of a Sequence.
177  enum class SiteCasing
178  {
/// Do not change the case of the sites.
182  kUnchanged,
183 
/// Make all sites upper case.
187  kToUpper,
188 
/// Make all sites lower case.
192  kToLower
193  };
194 
195  // ---------------------------------------------------------------------
196  // Constructor and Rule of Five
197  // ---------------------------------------------------------------------
198 
/// Create a default FastqReader.
208  FastqReader();
/// Compiler-generated destructor.
209  ~FastqReader() = default;
210 
/// Compiler-generated copy and move constructors (member-wise copy/move of all settings).
/// NOTE(review): member-wise copying also copies the quality_string_plugin_ functional as-is;
/// see its default initializer, which captures by reference — confirm this is intended.
211  FastqReader( FastqReader const& ) = default;
212  FastqReader( FastqReader&& ) = default;
213 
/// Compiler-generated copy and move assignment (member-wise assignment of all settings).
214  FastqReader& operator= ( FastqReader const& ) = default;
215  FastqReader& operator= ( FastqReader&& ) = default;
216 
217  // ---------------------------------------------------------------------
218  // Reading
219  // ---------------------------------------------------------------------
220 
/// Read all Sequences from an input source in Fastq format and return them as a SequenceSet.
228  SequenceSet read( std::shared_ptr< utils::BaseInputSource > source ) const;
229 
/// Overload of read() that takes the target SequenceSet by reference instead of returning one.
/// NOTE(review): presumably the Sequences read from `source` are added to `sequence_set`;
/// whether existing content of the set is kept is not visible here — confirm in fastq_reader.cpp.
240  void read( std::shared_ptr< utils::BaseInputSource > source, SequenceSet& sequence_set ) const;
241 
242  // ---------------------------------------------------------------------
243  // Parsing
244  // ---------------------------------------------------------------------
245 
/// Parse a whole fastq document into a SequenceSet.
/// @param input_stream  Stream to read the document from.
/// @param sequence_set  SequenceSet that receives the parsed data.
252  void parse_document(
253  utils::InputStream& input_stream,
254  SequenceSet& sequence_set
255  ) const;
256 
/// Parse a Sequence in Fastq format.
/// @param input_stream  Stream to read from.
/// @param sequence      Sequence object that receives the parsed data.
/// NOTE(review): the meaning of the bool return value is not visible in this header;
/// presumably `false` signals that no (further) sequence was available — confirm.
270  bool parse_sequence(
271  utils::InputStream& input_stream,
272  Sequence& sequence
273  ) const;
274 
275 protected:
276 
/// Parse a fastq sequence into the given sequence object.
280  bool parse_sequence_( utils::InputStream& input_stream, Sequence& sequence ) const;
281 
/// Parse the first label line (starting with an @).
285  void parse_label1_( utils::InputStream& input_stream, Sequence& sequence ) const;
286 
/// Parse the sequence line(s).
290  void parse_sites_( utils::InputStream& input_stream, Sequence& sequence ) const;
291 
/// Parse the second label line (starting with a +, and either empty or equal to the first).
295  void parse_label2_( utils::InputStream& input_stream, Sequence& sequence ) const;
296 
/// Parse the quality score line(s), which also runs the plugin, if available.
300  void parse_quality_( utils::InputStream& input_stream, Sequence& sequence ) const;
301 
302  // ---------------------------------------------------------------------
303  // Properties
304  // ---------------------------------------------------------------------
305 
306 public:
307 
316 
/// Return whether Sequence sites are automatically turned into upper or lower case.
320  SiteCasing site_casing() const;
321 
/// Setter counterpart of valid_chars() const.
/// NOTE(review): presumably sets the chars used for validating Sequence sites and returns
/// the reader itself to allow chaining — confirm against fastq_reader.cpp.
339  FastqReader& valid_chars( std::string const& chars );
340 
/// Return the currently set chars used for validating Sequence sites.
346  std::string valid_chars() const;
347 
355 
362 
368 
375 
376  // ---------------------------------------------------------------------
377  // Members
378  // ---------------------------------------------------------------------
379 
380 private:
381 
// Casing method applied to the sites; defaults to upper case.
382  SiteCasing site_casing_ = SiteCasing::kToUpper;
// Off by default. NOTE(review): presumably toggles checking of sites against lookup_ — confirm.
383  bool use_validation_ = false;
// Per-char boolean lookup. NOTE(review): presumably the internal CharLookup used for
// validating the Sequence sites (cf. valid_char_lookup()) — confirm.
384  utils::CharLookup<bool> lookup_;
385 
// Encoding passed to quality_decode_to_phred_score() by the default quality string plugin;
// defaults to Sanger.
386  QualityEncoding quality_encoding_ = QualityEncoding::kSanger;
387 
// Default handler: decodes the quality string via quality_decode_to_phred_score(), using the
// quality_encoding_ member, and stores the result through Sequence::phred_scores().
// NOTE(review): the [&] capture in this default member initializer binds `this`. As the copy
// and move operations of the class are defaulted, a copied or moved-to reader appears to keep
// a functional that still reads quality_encoding_ of the object it was copied from (ignoring
// its own setting, and dangling once that object is destroyed) — confirm, and consider
// re-binding the default plugin on copy/move or resolving the default inside parse_quality_().
388  // Functional that can be set to process the quality string found in fastq files.
389  quality_string_function quality_string_plugin_ = [&](
390  std::string const& quality_string, Sequence& sequence
391  ){
392  sequence.phred_scores( quality_decode_to_phred_score( quality_string, quality_encoding_ ));
393  };
394 
395  // Internal reading buffer
// Declared mutable, so it can be modified from const member functions.
// NOTE(review): presumably reused by the const parse functions to avoid reallocation — confirm.
396  mutable std::string buffer_;
397 
398 };
399 
400 } // namespace sequence
401 } // namespace genesis
402 
403 #endif // include guard
genesis::sequence::quality_decode_to_phred_score
unsigned char quality_decode_to_phred_score(char quality_code, QualityEncoding encoding)
Decode a single quality score char (for example coming from a fastq file) to a phred score.
Definition: quality.cpp:218
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:81
genesis::sequence::FastqReader::SiteCasing::kToLower
@ kToLower
Make all sites lower case.
genesis::sequence::FastqReader::parse_label1_
void parse_label1_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the first label line (starting with an @).
Definition: fastq_reader.cpp:135
genesis::sequence::FastqReader::valid_chars
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
Definition: fastq_reader.cpp:339
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
genesis::sequence::QualityEncoding::kSanger
@ kSanger
genesis::sequence::FastqReader::SiteCasing::kToUpper
@ kToUpper
Make all sites upper case.
genesis::sequence::FastqReader::read
SequenceSet read(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in Fastq format and return them as a SequenceSet.
Definition: fastq_reader.cpp:65
genesis::sequence::FastqReader
Read Fastq sequence data.
Definition: fastq_reader.hpp:149
input_source.hpp
genesis::sequence::FastqReader::parse_label2_
void parse_label2_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the second label line (starting with a +, and either empty or equal to the first).
Definition: fastq_reader.cpp:239
genesis::sequence::FastqReader::~FastqReader
~FastqReader()=default
genesis::sequence::FastqReader::SiteCasing::kUnchanged
@ kUnchanged
Do not change the case of the sites.
char_lookup.hpp
genesis::sequence::FastqReader::valid_char_lookup
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
Definition: fastq_reader.cpp:350
genesis::sequence::QualityEncoding
QualityEncoding
List of quality encodings for which we support decoding.
Definition: quality.hpp:72
genesis::utils::CharLookup< bool >
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::FastqReader::parse_sites_
void parse_sites_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the sequence line(s).
Definition: fastq_reader.cpp:178
genesis::sequence::SequenceSet
Store a set of Sequences.
Definition: sequence_set.hpp:59
genesis::sequence::FastqReader::parse_document
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fastq document into a SequenceSet.
Definition: fastq_reader.cpp:85
genesis::sequence::FastqReader::parse_quality_
void parse_quality_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the quality score line(s), which also runs the plugin, if available.
Definition: fastq_reader.cpp:268
genesis::sequence::FastqReader::FastqReader
FastqReader()
Create a default FastqReader.
Definition: fastq_reader.cpp:56
genesis::sequence::FastqReader::SiteCasing
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
Definition: fastq_reader.hpp:177
genesis::sequence::FastqReader::site_casing
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
Definition: fastq_reader.cpp:320
genesis::sequence::FastqReader::quality_string_plugin
FastqReader & quality_string_plugin(quality_string_function const &plugin)
Functional that can be set to process the quality string found in fastq files.
Definition: fastq_reader.cpp:366
genesis::sequence::FastqReader::operator=
FastqReader & operator=(FastqReader const &)=default
genesis::sequence::FastqReader::parse_sequence_
bool parse_sequence_(utils::InputStream &input_stream, Sequence &sequence) const
Parse a fastq sequence into the given sequence object.
Definition: fastq_reader.cpp:106
genesis::sequence::FastqReader::quality_string_function
std::function< void(std::string const &quality_string, Sequence &sequence) > quality_string_function
Function type that allows working with the quality line(s) in fastq files.
Definition: fastq_reader.hpp:172
genesis::sequence::FastqReader::parse_sequence
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fastq format.
Definition: fastq_reader.cpp:95
genesis::sequence::FastqReader::quality_encoding
QualityEncoding quality_encoding()
Return the currently set QualityEncoding that is used for decoding the quality score line of the Fastq file.
Definition: fastq_reader.cpp:361
sequence.hpp
quality.hpp