A library for working with phylogenetic and population genetic data.
v0.27.0
fastq_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
41 
42 #include <cassert>
43 #include <cctype>
44 #include <fstream>
45 #include <sstream>
46 #include <stdexcept>
47 #include <thread>
48 
49 namespace genesis {
50 namespace sequence {
51 
52 // =================================================================================================
53 // Constructor and Rule of Five
54 // =================================================================================================
55 
57 {
58  lookup_.set_all( true );
59 }
60 
61 // =================================================================================================
62 // Reading
63 // =================================================================================================
64 
65 SequenceSet FastqReader::read( std::shared_ptr< utils::BaseInputSource > source ) const
66 {
67  SequenceSet result;
68  utils::InputStream is( source );
69  parse_document( is, result );
70  return result;
71 }
72 
74  std::shared_ptr< utils::BaseInputSource > source,
75  SequenceSet& sequence_set
76 ) const {
77  utils::InputStream is( source );
78  parse_document( is, sequence_set );
79 }
80 
81 // =================================================================================================
82 // Parsing
83 // =================================================================================================
84 
86  utils::InputStream& input_stream,
87  SequenceSet& sequence_set
88 ) const {
89  Sequence tmp_seq;
90  while( parse_sequence_( input_stream, tmp_seq )) {
91  sequence_set.add( tmp_seq );
92  }
93 }
94 
96  utils::InputStream& input_stream,
97  Sequence& sequence
98 ) const {
99  return parse_sequence_( input_stream, sequence );
100 }
101 
102 // =================================================================================================
103 // Parsing Internals
104 // =================================================================================================
105 
106 bool FastqReader::parse_sequence_( utils::InputStream& input_stream, Sequence& sequence ) const
107 {
108  // Init. Call clear() in order to avoid not setting properties that might be added to
109  // Sequence in the future. Should not noticeable affect speed, as the sequence string capacities
110  // should not change when setting the strings to empty strings.
111  sequence.clear();
112 
113  // Check for data. If there is nothing, stop here. If there is data, then from here on,
114  // we expect a full fastq sequence to be present - otherwise, one of the below functions throws.
115  if( !input_stream ) {
116  return false;
117  }
118 
119  // Parse all elements of a Fastq sequence.
120  // We use a shared buffer for all of these functions that is filled with data from the input
121  // stream. This has the following reasoning: Sequence files can be quite big, and appending to
122  // a string can cause its size to double whenver the capacity is reached (depending on the STL
123  // implementation). We want to avoid that, which can usually be done by making a fresh copy of
124  // the string. However, reading into a string first, and then making a copy of it, necessitates
125  // two memory allocations. We can circumvent the first by using this buffer, which is re-used.
126  // Hence, we here rely on the fact that string::clear() does not change the capacity of the buffer.
127  parse_label1_( input_stream, sequence );
128  parse_sites_( input_stream, sequence );
129  parse_label2_( input_stream, sequence );
130  parse_quality_( input_stream, sequence );
131 
132  return true;
133 }
134 
135 void FastqReader::parse_label1_( utils::InputStream& input_stream, Sequence& sequence ) const
136 {
137  auto& it = input_stream;
138 
139  // Check beginning of sequence.
140  if( !it || *it != '@' ) {
141  throw std::runtime_error(
142  "Malformed Fastq " + it.source_name()
143  + ": Expecting '@' at beginning of sequence at line "
144  + std::to_string( it.line() ) + "."
145  );
146  }
147  assert( it && *it == '@' );
148  ++it;
149 
150  // Parse label.
151  auto const label1 = utils::read_while( it, isprint );
152  if( label1 == "" ) {
153  throw std::runtime_error(
154  "Malformed Fastq " + it.source_name()
155  + ": Expecting label after '@' in sequence at line "
156  + std::to_string( it.line() ) + "."
157  );
158  }
159 
160  // TODO do the isprint check after reading the whole line, for speed reasons.
161  // TODO also, use the ascii only version, again for speed
162 
163  // Check for unexpected end of file.
164  if( !it || *it != '\n' ) {
165  throw std::runtime_error(
166  "Malformed Fastq " + it.source_name()
167  + ": Unexpected characters at the end of the label line in sequence at line "
168  + std::to_string( it.line() ) + "."
169  );
170  }
171  assert( it && *it == '\n' );
172  ++it;
173 
174  // Copy the label into the sequence, which also makes sure that we do not store extra capacity.
175  sequence.label( label1 );
176 }
177 
178 void FastqReader::parse_sites_( utils::InputStream& input_stream, Sequence& sequence ) const
179 {
180  // Some prep shorthand.
181  auto& it = input_stream;
182  buffer_.clear();
183 
184  // Check for unexpected end of file.
185  if( !it ) {
186  throw std::runtime_error(
187  "Malformed Fastq " + it.source_name()
188  + ": Expecting a sequence sites line after the first label line at line "
189  + std::to_string( it.line() - 1 ) + "."
190  );
191  }
192  assert( it );
193 
194  // Parse sequence. At every beginning of the loop, we are at a line start.
195  // Continue until we find the '+' char, which marks the beginning of the second label
196  // for the quality line(s). This is the ill-defined part of the format that we have to live with.
197  while( it && *it != '+' ) {
198 
199  // The function is only called internally, and only ever when we are at the beginning
200  // of a new line. Assert this.
201  assert( it.column() == 1 );
202 
203  // The get_line function appends to the buffer.
204  it.get_line( buffer_ );
205  }
206  assert( !it || *it == '+' );
207 
208  if( buffer_.length() == 0 ) {
209  throw std::runtime_error(
210  "Malformed Fastq " + it.source_name() + ": Empty sequence at line "
211  + std::to_string( it.line() - 1 ) + "."
212  );
213  }
214 
215  // Apply site casing, if needed.
216  if( site_casing_ == SiteCasing::kToUpper ) {
218  } else if( site_casing_ == SiteCasing::kToLower ) {
220  }
221 
222  // Validate, if needed.
223  if( use_validation_ ) {
224  for( auto const& c : buffer_ ) {
225  if( !lookup_[c] ) {
226  throw std::runtime_error(
227  "Malformed Fastq " + it.source_name() + ": Invalid sequence symbol "
228  + utils::char_to_hex( c )
229  + " in sequence near line " + std::to_string( it.line() - 1 ) + "."
230  );
231  }
232  }
233  }
234 
235  // Copy the buffer to the sequence sites, which removes surplus capacity.
236  sequence.sites( buffer_ );
237 }
238 
239 void FastqReader::parse_label2_( utils::InputStream& input_stream, Sequence& sequence ) const
240 {
241  auto& it = input_stream;
242  buffer_.clear();
243 
244  // Check beginning of sequence.
245  if( !it || *it != '+' ) {
246  throw std::runtime_error(
247  "Malformed Fastq " + it.source_name()
248  + ": Expecting '+' at beginning of sequence at line "
249  + std::to_string( it.line() ) + "."
250  );
251  }
252  assert( it && *it == '+' );
253  ++it;
254 
255  // Parse label. No need to run the vailidty check here again, as we can simply compare
256  // against line1 that was read before. So, we can use the buffer.
257  // The get_line function appends to the buffer.
258  it.get_line( buffer_ );
259 
260  if( ! buffer_.empty() && buffer_ != sequence.label() ) {
261  throw std::runtime_error(
262  "Malformed Fastq " + it.source_name() + ": Expecting the second label line to either " +
263  "be empty or equal to the first label line at line " + std::to_string( it.line() ) + "."
264  );
265  }
266 }
267 
268 void FastqReader::parse_quality_( utils::InputStream& input_stream, Sequence& sequence ) const
269 {
270  auto& it = input_stream;
271  buffer_.clear();
272 
273  // Check for unexpected end of file.
274  if( !it ) {
275  throw std::runtime_error(
276  "Malformed Fastq " + it.source_name()
277  + ": Expecting quality scores after the second label line at line "
278  + std::to_string( it.line() - 1 ) + "."
279  );
280  }
281  assert( it );
282 
283  // Parse qualities. At every beginning of the loop, we are at a line start.
284  // Continue until we have read as much characters as the sequence is long.
285  while( it && buffer_.size() < sequence.sites().size() ) {
286 
287  // Again, this function is only called internally, and only ever when we are at the
288  // beginning of a new line. Assert this.
289  assert( it.column() == 1 );
290 
291  // The get_line function appends to the buffer.
292  it.get_line( buffer_ );
293  }
294  assert( !it || buffer_.size() >= sequence.sites().size() );
295 
296  if( buffer_.size() != sequence.sites().size() ) {
297  throw std::runtime_error(
298  "Malformed Fastq " + it.source_name()
299  + ": Expecting the quality scores to be of the same length as the sequence at line "
300  + std::to_string( it.line() - 1 ) + "."
301  );
302  }
303 
304  // Run the plugin, if availabe.
305  if( quality_string_plugin_ ) {
306  quality_string_plugin_( buffer_, sequence );
307  }
308 }
309 
310 // =================================================================================================
311 // Properties
312 // =================================================================================================
313 
315 {
316  site_casing_ = value;
317  return *this;
318 }
319 
321 {
322  return site_casing_;
323 }
324 
325 FastqReader& FastqReader::valid_chars( std::string const& chars )
326 {
327  if( chars.size() == 0 ) {
328  lookup_.set_all( true );
329  use_validation_ = false;
330  } else {
331  lookup_.set_all( false );
332  lookup_.set_selection( chars, true );
333  use_validation_ = true;
334  }
335 
336  return *this;
337 }
338 
339 std::string FastqReader::valid_chars() const
340 {
341  // We need to check the valid chars lookup here, because we don't want to return a string
342  // of _all_ chars.
343  if( ! use_validation_ || lookup_.all_equal_to( true ) ) {
344  return "";
345  } else {
346  return lookup_.get_chars_equal_to( true );
347  }
348 }
349 
351 {
352  return lookup_;
353 }
354 
356 {
357  quality_encoding_ = encoding;
358  return *this;
359 }
360 
362 {
363  return quality_encoding_;
364 }
365 
367 {
368  quality_string_plugin_ = plugin;
369  return *this;
370 }
371 
372 
373 } // namespace sequence
374 } // namespace genesis
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:81
genesis::sequence::Sequence::clear
void clear()
Definition: sequence/sequence.hpp:78
fastq_reader.hpp
genesis::sequence::FastqReader::SiteCasing::kToLower
@ kToLower
Make all sites lower case.
genesis::utils::read_while
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
Definition: scanner.hpp:216
fs.hpp
Provides functions for accessing the file system.
genesis::utils::to_upper_ascii_inplace
void to_upper_ascii_inplace(std::string &str)
Turn the given string to all-uppercase, ASCII-only, inline.
Definition: string.cpp:675
genesis::sequence::FastqReader::parse_label1_
void parse_label1_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the first label line (starting with an @).
Definition: fastq_reader.cpp:135
genesis::sequence::FastqReader::valid_chars
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
Definition: fastq_reader.cpp:339
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
genesis::utils::CharLookup::all_equal_to
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
Definition: char_lookup.hpp:242
genesis::sequence::FastqReader::SiteCasing::kToUpper
@ kToUpper
Make all sites upper case.
genesis::utils::to_lower_ascii_inplace
void to_lower_ascii_inplace(std::string &str)
Turn the given string to all-lowercase, ASCII-only.
Definition: string.cpp:651
sequence_set.hpp
genesis::sequence::FastqReader::read
SequenceSet read(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in Fastq format and return them as a SequenceSet.
Definition: fastq_reader.cpp:65
genesis::sequence::FastqReader
Read Fastq sequence data.
Definition: fastq_reader.hpp:149
std.hpp
Provides some valuable additions to STD.
genesis::sequence::FastqReader::parse_label2_
void parse_label2_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the second label line (starting with a +, and either empty or equal to the first).
Definition: fastq_reader.cpp:239
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
string.hpp
Provides some commonly used string utility functions.
input_stream.hpp
genesis::sequence::Sequence::label
std::string & label()
Definition: sequence/sequence.hpp:90
genesis::sequence::FastqReader::valid_char_lookup
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
Definition: fastq_reader.cpp:350
genesis::sequence::QualityEncoding
QualityEncoding
List of quality encodings for which we support decoding.
Definition: quality.hpp:72
genesis::utils::CharLookup< bool >
genesis::utils::CharLookup::get_chars_equal_to
std::string get_chars_equal_to(T comp_value) const
Return a std::string containg all chars which have lookup status equal to a given value.
Definition: char_lookup.hpp:228
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::FastqReader::parse_sites_
void parse_sites_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the sequence line(s).
Definition: fastq_reader.cpp:178
genesis::sequence::SequenceSet
Store a set of Sequences.
Definition: sequence_set.hpp:59
genesis::sequence::FastqReader::parse_document
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fastq document into a SequenceSet.
Definition: fastq_reader.cpp:85
char.hpp
genesis::sequence::FastqReader::parse_quality_
void parse_quality_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the quality score line(s), which also runs the plugin, if available.
Definition: fastq_reader.cpp:268
genesis::sequence::SequenceSet::add
reference add(Sequence const &s)
Add a Sequence to the SequenceSet by copying it, and return a reference to it.
Definition: sequence_set.cpp:86
genesis::sequence::Sequence::sites
std::string & sites()
Definition: sequence/sequence.hpp:110
genesis::sequence::FastqReader::FastqReader
FastqReader()
Create a default FastqReader.
Definition: fastq_reader.cpp:56
genesis::sequence::FastqReader::SiteCasing
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
Definition: fastq_reader.hpp:177
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
scanner.hpp
genesis::sequence::FastqReader::site_casing
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
Definition: fastq_reader.cpp:320
genesis::sequence::FastqReader::quality_string_plugin
FastqReader & quality_string_plugin(quality_string_function const &plugin)
Functional that can be set to process the quality string found in fastq files.
Definition: fastq_reader.cpp:366
genesis::sequence::FastqReader::parse_sequence_
bool parse_sequence_(utils::InputStream &input_stream, Sequence &sequence) const
Parse a fastq sequence into the given sequence object.
Definition: fastq_reader.cpp:106
genesis::utils::CharLookup::set_all
void set_all(T value)
Set the lookup status for all chars at once.
Definition: char_lookup.hpp:192
genesis::sequence::FastqReader::quality_string_function
std::function< void(std::string const &quality_string, Sequence &sequence) > quality_string_function
Function type that allows to work with the quality line(s) in fastq files.
Definition: fastq_reader.hpp:172
genesis::sequence::FastqReader::parse_sequence
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fastq format.
Definition: fastq_reader.cpp:95
genesis::sequence::FastqReader::quality_encoding
QualityEncoding quality_encoding()
Return the currently set QualityEncoding that is used for decoding the quality score line of the Fast...
Definition: fastq_reader.cpp:361
genesis::utils::CharLookup::set_selection
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
Definition: char_lookup.hpp:157
sequence.hpp