A toolkit for working with phylogenetic data.
v0.24.0
fastq_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
41 
42 #include <cassert>
43 #include <cctype>
44 #include <fstream>
45 #include <sstream>
46 #include <stdexcept>
47 #include <thread>
48 
49 namespace genesis {
50 namespace sequence {
51 
52 // =================================================================================================
53 // Constructor and Rule of Five
54 // =================================================================================================
55 
57 {
58  lookup_.set_all( true );
59 }
60 
61 // =================================================================================================
62 // Reading
63 // =================================================================================================
64 
65 SequenceSet FastqReader::read( std::shared_ptr< utils::BaseInputSource > source ) const
66 {
67  SequenceSet result;
68  utils::InputStream is( source );
69  parse_document( is, result );
70  return result;
71 }
72 
74  std::shared_ptr< utils::BaseInputSource > source,
75  SequenceSet& sequence_set
76 ) const {
77  utils::InputStream is( source );
78  parse_document( is, sequence_set );
79 }
80 
81 // =================================================================================================
82 // Parsing
83 // =================================================================================================
84 
86  utils::InputStream& input_stream,
87  SequenceSet& sequence_set
88 ) const {
89  std::string buffer;
90  Sequence tmp_seq;
91  while( parse_sequence_( input_stream, buffer, tmp_seq )) {
92  sequence_set.add( tmp_seq );
93  }
94 }
95 
97  utils::InputStream& input_stream,
98  Sequence& sequence
99 ) const {
100  static std::string buffer;
101  return parse_sequence_( input_stream, buffer, sequence );
102 }
103 
104 // =================================================================================================
105 // Parsing Internals
106 // =================================================================================================
107 
108 bool FastqReader::parse_sequence_( utils::InputStream& input_stream, std::string& buffer, Sequence& sequence ) const
109 {
110  // Init. Call clear() in order to avoid not setting properties that might be added to
111  // Sequence in the future. Should not noticeable affect speed, as the sequence string capacities
112  // should not change when setting the strings to empty strings.
113  sequence.clear();
114 
115  // Check for data. If there is nothing, stop here. If there is data, then from here on,
116  // we expect a full fastq sequence to be present - otherwise, one of the below functions throws.
117  if( !input_stream ) {
118  return false;
119  }
120 
121  // Parse all elements of a Fastq sequence.
122  // We use a shared buffer for all of these functions that is filled with data from the input
123  // stream. This has the following reasoning: Sequence files can be quite big, and appending to
124  // a string can cause its size to double whenver the capacity is reached (depending on the STL
125  // implementation). We want to avoid that, which can usually be done by making a fresh copy of
126  // the sting. However, reading into a string first, and then making a copy of it, necessitates
127  // two memory allocations. We can circumvent the first by using this buffer, which is re-used.
128  // Hence, we here rely on the fact that string::clear() does not change the capacity of the buffer.
129  parse_label1_( input_stream, buffer, sequence );
130  parse_sites_( input_stream, buffer, sequence );
131  parse_label2_( input_stream, buffer, sequence );
132  parse_quality_( input_stream, buffer, sequence );
133 
134  return true;
135 }
136 
137 void FastqReader::parse_label1_( utils::InputStream& input_stream, std::string& buffer, Sequence& sequence ) const
138 {
139  // Not using the buffer here, as the reading function in here overwrites the whole string,
140  // hence, the capacity of the buffer would shring, which destroys its whole purpose.
141  (void) buffer;
142  auto& it = input_stream;
143 
144  // Check beginning of sequence.
145  if( !it || *it != '@' ) {
146  throw std::runtime_error(
147  "Malformed Fastq " + it.source_name()
148  + ": Expecting '@' at beginning of sequence at line "
149  + std::to_string( it.line() ) + "."
150  );
151  }
152  assert( it && *it == '@' );
153  ++it;
154 
155  // Parse label.
156  auto const label1 = utils::read_while( it, isprint );
157  if( label1 == "" ) {
158  throw std::runtime_error(
159  "Malformed Fastq " + it.source_name()
160  + ": Expecting label after '@' in sequence at line "
161  + std::to_string( it.line() ) + "."
162  );
163  }
164 
165  // TODO do the isprint check after reading the whole line, for speed reasons.
166  // TODO also, use the ascii only version, again for speed
167 
168  // Check for unexpected end of file.
169  if( !it || *it != '\n' ) {
170  throw std::runtime_error(
171  "Malformed Fastq " + it.source_name()
172  + ": Unexpected characters at the end of the label line in sequence at line "
173  + std::to_string( it.line() ) + "."
174  );
175  }
176  assert( it && *it == '\n' );
177  ++it;
178 
179  // Copy the label into the sequence, which also makes sure that we do not store extra capacity.
180  sequence.label( label1 );
181 }
182 
183 void FastqReader::parse_sites_( utils::InputStream& input_stream, std::string& buffer, Sequence& sequence ) const
184 {
185  // Some prep shorthand.
186  auto& it = input_stream;
187  buffer.clear();
188 
189  // Check for unexpected end of file.
190  if( !it ) {
191  throw std::runtime_error(
192  "Malformed Fastq " + it.source_name()
193  + ": Expecting a sequence sites line after the first label line at line "
194  + std::to_string( it.line() - 1 ) + "."
195  );
196  }
197  assert( it );
198 
199  // Parse sequence. At every beginning of the loop, we are at a line start.
200  // Continue until we find the '+' char, which marks the beginning of the second label
201  // for the quality line(s). This is the ill-defined part of the format that we have to live with.
202  while( it && *it != '+' ) {
203 
204  // The function is only called internally, and only ever when we are at the beginning
205  // of a new line. Assert this.
206  assert( it.column() == 1 );
207 
208  // The get_line function appends to the buffer.
209  it.get_line( buffer );
210  }
211  assert( !it || *it == '+' );
212 
213  if( buffer.length() == 0 ) {
214  throw std::runtime_error(
215  "Malformed Fastq " + it.source_name() + ": Empty sequence at line "
216  + std::to_string( it.line() - 1 ) + "."
217  );
218  }
219 
220  // Apply site casing, if needed.
221  if( site_casing_ == SiteCasing::kToUpper ) {
223  } else if( site_casing_ == SiteCasing::kToLower ) {
225  }
226 
227  // Validate, if needed.
228  if( use_validation_ ) {
229  for( auto const& c : buffer ) {
230  if( !lookup_[c] ) {
231  throw std::runtime_error(
232  "Malformed Fastq " + it.source_name() + ": Invalid sequence symbol "
233  + utils::char_to_hex( c )
234  + " in sequence near line " + std::to_string( it.line() - 1 ) + "."
235  );
236  }
237  }
238  }
239 
240  // Copy the buffer to the sequence sites, which removes surplus capacity.
241  sequence.sites( buffer );
242 }
243 
244 void FastqReader::parse_label2_( utils::InputStream& input_stream, std::string& buffer, Sequence& sequence ) const
245 {
246  auto& it = input_stream;
247  buffer.clear();
248 
249  // Check beginning of sequence.
250  if( !it || *it != '+' ) {
251  throw std::runtime_error(
252  "Malformed Fastq " + it.source_name()
253  + ": Expecting '+' at beginning of sequence at line "
254  + std::to_string( it.line() ) + "."
255  );
256  }
257  assert( it && *it == '+' );
258  ++it;
259 
260  // Parse label. No need to run the vailidty check here again, as we can simply compare
261  // against line1 that was read before. So, we can use the buffer.
262  // The get_line function appends to the buffer.
263  it.get_line( buffer );
264 
265  if( ! buffer.empty() && buffer != sequence.label() ) {
266  throw std::runtime_error(
267  "Malformed Fastq " + it.source_name() + ": Expecting the second label line to either " +
268  "be empty or equal to the first label line at line " + std::to_string( it.line() ) + "."
269  );
270  }
271 }
272 
273 void FastqReader::parse_quality_( utils::InputStream& input_stream, std::string& buffer, Sequence& sequence ) const
274 {
275  auto& it = input_stream;
276  buffer.clear();
277 
278  // Check for unexpected end of file.
279  if( !it ) {
280  throw std::runtime_error(
281  "Malformed Fastq " + it.source_name()
282  + ": Expecting quality scores after the second label line at line "
283  + std::to_string( it.line() - 1 ) + "."
284  );
285  }
286  assert( it );
287 
288  // Parse qualities. At every beginning of the loop, we are at a line start.
289  // Continue until we have read as much characters as the sequence is long.
290  while( it && buffer.size() < sequence.sites().size() ) {
291 
292  // Again, this function is only called internally, and only ever when we are at the
293  // beginning of a new line. Assert this.
294  assert( it.column() == 1 );
295 
296  // The get_line function appends to the buffer.
297  it.get_line( buffer );
298  }
299  assert( !it || buffer.size() >= sequence.sites().size() );
300 
301  if( buffer.size() != sequence.sites().size() ) {
302  throw std::runtime_error(
303  "Malformed Fastq " + it.source_name()
304  + ": Expecting the quality scores to be of the same length as the sequence at line "
305  + std::to_string( it.line() - 1 ) + "."
306  );
307  }
308 
309  // Run the plugin, if availabe.
310  if( quality_string_plugin_ ) {
311  quality_string_plugin_( buffer, sequence );
312  }
313 }
314 
315 // =================================================================================================
316 // Properties
317 // =================================================================================================
318 
320 {
321  site_casing_ = value;
322  return *this;
323 }
324 
326 {
327  return site_casing_;
328 }
329 
330 FastqReader& FastqReader::valid_chars( std::string const& chars )
331 {
332  if( chars.size() == 0 ) {
333  lookup_.set_all( true );
334  use_validation_ = false;
335  } else {
336  lookup_.set_all( false );
337  lookup_.set_selection( chars, true );
338  use_validation_ = true;
339  }
340 
341  return *this;
342 }
343 
344 std::string FastqReader::valid_chars() const
345 {
346  // We need to check the valid chars lookup here, because we don't want to return a string
347  // of _all_ chars.
348  if( ! use_validation_ || lookup_.all_equal_to( true ) ) {
349  return "";
350  } else {
351  return lookup_.get_chars_equal_to( true );
352  }
353 }
354 
356 {
357  return lookup_;
358 }
359 
361 {
362  quality_encoding_ = encoding;
363  return *this;
364 }
365 
367 {
368  return quality_encoding_;
369 }
370 
372 {
373  quality_string_plugin_ = plugin;
374  return *this;
375 }
376 
377 
378 } // namespace sequence
379 } // namespace genesis
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
void set_all(T value)
Set the lookup status for all chars at once.
Read Fastq sequence data.
void to_upper_ascii_inplace(std::string &str)
Turn the given string to all-uppercase, ASCII-only, inline.
Definition: string.cpp:467
bool parse_sequence_(utils::InputStream &input_stream, std::string &buffer, Sequence &sequence) const
Parse a fastq sequence into the given sequence object.
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fastq format.
std::string get_chars_equal_to(T comp_value) const
Return a std::string containg all chars which have lookup status equal to a given value...
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:67
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
SequenceSet read(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in Fastq format and return them as a SequenceSet.
Provides some valuable additions to STD.
void parse_label2_(utils::InputStream &input_stream, std::string &buffer, Sequence &sequence) const
Parse the second label line (starting with a +, and either empty or equal to the first).
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one...
Definition: scanner.hpp:216
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
void to_lower_ascii_inplace(std::string &str)
Turn the given string to all-lowercase, ASCII-only.
Definition: string.cpp:443
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
void parse_quality_(utils::InputStream &input_stream, std::string &buffer, Sequence &sequence) const
Parse the quality score line(s), which also runs the plugin, if available.
Provides some commonly used string utility functions.
void parse_sites_(utils::InputStream &input_stream, std::string &buffer, Sequence &sequence) const
Parse the sequence line(s).
FastqReader()
Create a default FastqReader.
Provides functions for accessing the file system.
Store a set of Sequences.
reference add(Sequence const &s)
Add a Sequence to the SequenceSet by copying it, and return a reference to it.
QualityEncoding
List of quality encodings for which we support decoding.
Definition: quality.hpp:71
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
QualityEncoding quality_encoding()
Return the currently set QualityEncoding that is used for decoding the quality score line of the Fast...
void parse_label1_(utils::InputStream &input_stream, std::string &buffer, Sequence &sequence) const
Parse the first label line (starting with an @).
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
FastqReader & quality_string_plugin(quality_string_function const &plugin)
Functional that can be set to process the quality string found in fastq files.
std::function< void(std::string const &quality_string, Sequence &sequence) > quality_string_function
Function type that allows to work with the quality line(s) in fastq files.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fastq document into a SequenceSet.