A toolkit for working with phylogenetic data.
v0.24.0
fastq_reader.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_SEQUENCE_FORMATS_FASTQ_READER_H_
2 #define GENESIS_SEQUENCE_FORMATS_FASTQ_READER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2020 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
38 
39 #include <functional>
40 #include <iosfwd>
41 #include <memory>
42 #include <string>
43 
44 namespace genesis {
45 
46 // =================================================================================================
47 // Forward Declarations
48 // =================================================================================================
49 
// Forward declarations of the types that this header only uses by reference,
// so that their full definitions do not need to be included here.
// InputStream is the stream interface used by the parse_*() functions below.
50 namespace utils {
51  class InputStream;
52 }
53 
// SequenceSet is the container of Sequences that the reader produces;
// Sequence is a single record.
54 namespace sequence {
55  class SequenceSet;
56  class Sequence;
57 }
58 
59 // =================================================================================================
60 // Fastq Reader
61 // =================================================================================================
62 
63 namespace sequence {
64 
150 {
151 public:
152 
153  // ---------------------------------------------------------------------
154  // Typedefs and Enums
155  // ---------------------------------------------------------------------
156 
// Callback type for working with the quality string of a fastq record.
// The callable receives the quality string (read-only) and the Sequence being
// built (mutable), and returns nothing.
170  using quality_string_function = std::function< void(
171  std::string const& quality_string, Sequence& sequence
172  ) >;
173 
// Casing method to apply to each site of a Sequence.
// The reader's default setting is kToUpper (see the site_casing_ member).
177  enum class SiteCasing
178  {
// Presumably: keep the sites exactly as found in the input - confirm in the implementation.
182  kUnchanged,
183 
// Presumably: convert all sites to upper case - confirm in the implementation.
187  kToUpper,
188 
// Presumably: convert all sites to lower case - confirm in the implementation.
192  kToLower
193  };
194 
195  // ---------------------------------------------------------------------
196  // Constructor and Rule of Five
197  // ---------------------------------------------------------------------
198 
// Default constructor; declared here and defined out of line.
208  FastqReader();
// Destructor, copy and move operations all use the compiler-generated defaults.
// NOTE(review): the default member initializer of quality_string_plugin_ is a
// lambda with a [&] capture that reads quality_encoding_, which means it
// captures `this`. The defaulted copy/move operations copy the std::function
// as-is, so the default plugin of a copied reader appears to still refer to the
// source object (and dangles once the source is destroyed). Confirm, and
// consider re-binding the default plugin in the copy/move operations.
209  ~FastqReader() = default;
210 
211  FastqReader( FastqReader const& ) = default;
212  FastqReader( FastqReader&& ) = default;
213 
214  FastqReader& operator= ( FastqReader const& ) = default;
215  FastqReader& operator= ( FastqReader&& ) = default;
216 
217  // ---------------------------------------------------------------------
218  // Reading
219  // ---------------------------------------------------------------------
220 
// Read fastq sequence data from the given input source and return it as a new
// SequenceSet. The source is taken as a shared_ptr by value.
// NOTE(review): utils::BaseInputSource is not among the forward declarations
// visible in this header; presumably it comes from an include - confirm.
228  SequenceSet read( std::shared_ptr< utils::BaseInputSource > source ) const;
229 
// Same as above, but stores the result in the caller-provided sequence_set
// instead of returning a new one.
// Presumably appends to existing content rather than replacing it - confirm in
// the implementation.
240  void read( std::shared_ptr< utils::BaseInputSource > source, SequenceSet& sequence_set ) const;
241 
242  // ---------------------------------------------------------------------
243  // Parsing
244  // ---------------------------------------------------------------------
245 
// Parse a whole fastq document from input_stream into sequence_set.
// Presumably reads records until the end of the stream - confirm in the
// implementation.
252  void parse_document(
253  utils::InputStream& input_stream,
254  SequenceSet& sequence_set
255  ) const;
256 
// Parse a single fastq record from input_stream into sequence.
// The bool result presumably tells whether a record was read (false at the end
// of the input) - confirm in the implementation.
270  bool parse_sequence(
271  utils::InputStream& input_stream,
272  Sequence& sequence
273  ) const;
274 
275 protected:
276 
// Internal helpers. All of them take the input stream, a caller-provided string
// buffer, and the Sequence being filled. The buffer is presumably reused
// between calls to avoid reallocations - confirm in the implementation.

// Internal counterpart of parse_sequence() that additionally takes the buffer.
280  bool parse_sequence_( utils::InputStream& input_stream, std::string& buffer, Sequence& sequence ) const;
281 
// Presumably parses the first label line of a fastq record - confirm.
285  void parse_label1_( utils::InputStream& input_stream, std::string& buffer, Sequence& sequence ) const;
286 
// Presumably parses the sequence sites of a fastq record - confirm.
290  void parse_sites_( utils::InputStream& input_stream, std::string& buffer, Sequence& sequence ) const;
291 
// Presumably parses the second label line of a fastq record - confirm.
295  void parse_label2_( utils::InputStream& input_stream, std::string& buffer, Sequence& sequence ) const;
296 
// Presumably parses the quality line(s) of a fastq record - confirm.
300  void parse_quality_( utils::InputStream& input_stream, std::string& buffer, Sequence& sequence ) const;
301 
302  // ---------------------------------------------------------------------
303  // Properties
304  // ---------------------------------------------------------------------
305 
306 public:
307 
// Set the casing method to apply to the sites of each Sequence.
// Returns a FastqReader reference, presumably *this to allow chaining - confirm.
315  FastqReader& site_casing( SiteCasing value );
316 
// Get the currently set casing method.
320  SiteCasing site_casing() const;
321 
// Set the characters that are considered valid, given as a string.
// Returns a FastqReader reference, presumably *this to allow chaining - confirm.
339  FastqReader& valid_chars( std::string const& chars );
340 
// Get the valid characters as a string (returned by value).
346  std::string valid_chars() const;
347 
// Get a mutable reference to a char lookup table of valid characters,
// presumably the lookup_ member - confirm. Changes made through the reference
// affect the reader.
354  utils::CharLookup<bool>& valid_char_lookup();
355 
// Set the quality encoding.
// Returns a FastqReader reference, presumably *this to allow chaining - confirm.
361  FastqReader& quality_encoding( QualityEncoding encoding );
362 
// Get the quality encoding.
// NOTE(review): this getter is not const, unlike site_casing() and
// valid_chars(), so it cannot be called on a const FastqReader. Adding const
// also requires changing the out-of-line definition.
367  QualityEncoding quality_encoding();
368 
// Set the function that processes the quality string found in fastq files.
// Presumably replaces the default plugin, which decodes the string to phred
// scores - confirm in the implementation.
374  FastqReader& quality_string_plugin( quality_string_function const& plugin );
375 
376  // ---------------------------------------------------------------------
377  // Members
378  // ---------------------------------------------------------------------
379 
380 private:
381 
// Casing applied to the sites; defaults to upper case.
382  SiteCasing site_casing_ = SiteCasing::kToUpper;
// Validation flag, off by default, and the char lookup table. Presumably the
// flag controls whether sites are checked against lookup_ - confirm.
383  bool use_validation_ = false;
384  utils::CharLookup<bool> lookup_;
385 
// Encoding passed to the default quality plugin below; defaults to Sanger.
386  QualityEncoding quality_encoding_ = QualityEncoding::kSanger;
387 
388  // Function that can be set to process the quality string found in fastq files.
// The default decodes the quality string to phred scores using the current
// quality_encoding_ and stores them in the sequence.
// NOTE(review): the [&] capture reads quality_encoding_, so the lambda captures
// `this`. Together with the defaulted copy/move operations of this class, the
// default plugin of a copied reader appears to keep referring to the object it
// was copied from. Confirm and fix alongside the constructors.
389  quality_string_function quality_string_plugin_ = [&](
390  std::string const& quality_string, Sequence& sequence
391  ){
392  sequence.phred_scores( quality_decode_to_phred_score( quality_string, quality_encoding_ ));
393  };
394 
395 };
396 
397 } // namespace sequence
398 } // namespace genesis
399 
400 #endif // include guard
Read Fastq sequence data.
unsigned char quality_decode_to_phred_score(char quality_code, QualityEncoding encoding)
Decode a single quality score char (for example coming from a fastq file) to a phred score...
Definition: quality.cpp:86
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::vector< unsigned char > & phred_scores()
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
Store a set of Sequences.
QualityEncoding
List of quality encodings for which we support decoding.
Definition: quality.hpp:71
std::function< void(std::string const &quality_string, Sequence &sequence) > quality_string_function
Function type that allows working with the quality line(s) in fastq files.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...