A library for working with phylogenetic and population genetic data.
v0.32.0
fastq_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
41 
42 #include <algorithm>
43 #include <cassert>
44 #include <cctype>
45 #include <fstream>
46 #include <sstream>
47 #include <stdexcept>
48 #include <thread>
49 
50 namespace genesis {
51 namespace sequence {
52 
53 // =================================================================================================
54 // Constructor and Rule of Five
55 // =================================================================================================
56 
58 {
59  lookup_.set_all( true );
60 }
61 
62 // =================================================================================================
63 // Reading
64 // =================================================================================================
65 
66 SequenceSet FastqReader::read( std::shared_ptr< utils::BaseInputSource > source ) const
67 {
68  SequenceSet result;
69  utils::InputStream is( source );
70  parse_document( is, result );
71  return result;
72 }
73 
75  std::shared_ptr< utils::BaseInputSource > source,
76  SequenceSet& sequence_set
77 ) const {
78  utils::InputStream is( source );
79  parse_document( is, sequence_set );
80 }
81 
82 // =================================================================================================
83 // Parsing
84 // =================================================================================================
85 
87  utils::InputStream& input_stream,
88  SequenceSet& sequence_set
89 ) const {
90  Sequence tmp_seq;
91  while( parse_sequence_( input_stream, tmp_seq )) {
92  sequence_set.add( tmp_seq );
93  }
94 }
95 
97  utils::InputStream& input_stream,
98  Sequence& sequence
99 ) const {
100  return parse_sequence_( input_stream, sequence );
101 }
102 
103 // =================================================================================================
104 // Parsing Internals
105 // =================================================================================================
106 
107 bool FastqReader::parse_sequence_( utils::InputStream& input_stream, Sequence& sequence ) const
108 {
109  // Init. Call clear() in order to avoid not setting properties that might be added to
110  // Sequence in the future. Should not noticeable affect speed, as the sequence string capacities
111  // should not change when setting the strings to empty strings.
112  sequence.clear();
113 
114  // Check for data. If there is nothing, stop here. If there is data, then from here on,
115  // we expect a full fastq sequence to be present - otherwise, one of the below functions throws.
116  if( !input_stream ) {
117  return false;
118  }
119 
120  // Parse all elements of a Fastq sequence.
121  // We use a shared buffer for all of these functions that is filled with data from the input
122  // stream. This has the following reasoning: Sequence files can be quite big, and appending to
123  // a string can cause its size to double whenver the capacity is reached (depending on the STL
124  // implementation). We want to avoid that, which can usually be done by making a fresh copy of
125  // the string. However, reading into a string first, and then making a copy of it, necessitates
126  // two memory allocations. We can circumvent the first by using this buffer, which is re-used.
127  // Hence, we here rely on the fact that string::clear() does not change the capacity of the buffer.
128  parse_label1_( input_stream, sequence );
129  parse_sites_( input_stream, sequence );
130  parse_label2_( input_stream, sequence );
131  parse_quality_( input_stream, sequence );
132 
133  return true;
134 }
135 
136 void FastqReader::parse_label1_( utils::InputStream& input_stream, Sequence& sequence ) const
137 {
138  auto& it = input_stream;
139  buffer_.clear();
140 
141  // Check beginning of sequence.
142  if( !it || *it != '@' ) {
143  throw std::runtime_error(
144  "Malformed Fastq " + it.source_name()
145  + ": Expecting '@' at beginning of sequence at line "
146  + std::to_string( it.line() ) + "."
147  );
148  }
149  assert( it && *it == '@' );
150  ++it;
151 
152  // Parse label.
153  it.get_line( buffer_ );
154  auto const buffer_is_print = std::all_of(
155  buffer_.cbegin(),
156  buffer_.cend(),
157  []( char c ){
158  return utils::is_print( c );
159  }
160  );
161  if( buffer_.empty() || !buffer_is_print ) {
162  throw std::runtime_error(
163  "Malformed Fastq " + it.source_name() + ": Expecting valid label after '@' "
164  "in sequence at line " + std::to_string( it.line() ) + ", but instead the label "
165  "is empty or contains non-printable characters."
166  );
167  }
168 
169  // Copy the label into the sequence, which also makes sure that we do not store extra capacity.
170  sequence.label( buffer_ );
171 }
172 
173 void FastqReader::parse_sites_( utils::InputStream& input_stream, Sequence& sequence ) const
174 {
175  // Some prep shorthand.
176  auto& it = input_stream;
177  buffer_.clear();
178 
179  // Check for unexpected end of file.
180  if( !it ) {
181  throw std::runtime_error(
182  "Malformed Fastq " + it.source_name()
183  + ": Expecting a sequence sites line after the first label line at line "
184  + std::to_string( it.line() - 1 ) + "."
185  );
186  }
187  assert( it );
188 
189  // Parse sequence. At every beginning of the loop, we are at a line start.
190  // Continue until we find the '+' char, which marks the beginning of the second label
191  // for the quality line(s). This is the ill-defined part of the format that we have to live with.
192  while( it && *it != '+' ) {
193 
194  // The function is only called internally, and only ever when we are at the beginning
195  // of a new line. Assert this.
196  assert( it.column() == 1 );
197 
198  // The get_line function appends to the buffer.
199  it.get_line( buffer_ );
200  }
201  assert( !it || *it == '+' );
202 
203  if( buffer_.length() == 0 ) {
204  throw std::runtime_error(
205  "Malformed Fastq " + it.source_name() + ": Empty sequence at line "
206  + std::to_string( it.line() - 1 ) + "."
207  );
208  }
209 
210  // Apply site casing, if needed.
211  if( site_casing_ == SiteCasing::kToUpper ) {
213  } else if( site_casing_ == SiteCasing::kToLower ) {
215  }
216 
217  // Validate, if needed.
218  if( use_validation_ ) {
219  for( auto const& c : buffer_ ) {
220  if( !lookup_[c] ) {
221  throw std::runtime_error(
222  "Malformed Fastq " + it.source_name() + ": Invalid sequence symbol "
223  + utils::char_to_hex( c )
224  + " in sequence near line " + std::to_string( it.line() - 1 ) + "."
225  );
226  }
227  }
228  }
229 
230  // Copy the buffer to the sequence sites, which removes surplus capacity.
231  sequence.sites( buffer_ );
232 }
233 
234 void FastqReader::parse_label2_( utils::InputStream& input_stream, Sequence& sequence ) const
235 {
236  auto& it = input_stream;
237  buffer_.clear();
238 
239  // Check beginning of sequence.
240  if( !it || *it != '+' ) {
241  throw std::runtime_error(
242  "Malformed Fastq " + it.source_name()
243  + ": Expecting '+' at beginning of sequence at line "
244  + std::to_string( it.line() ) + "."
245  );
246  }
247  assert( it && *it == '+' );
248  ++it;
249 
250  // Parse label. No need to run the vailidty check here again, as we can simply compare
251  // against line1 that was read before. So, we can use the buffer.
252  // The get_line function appends to the buffer.
253  it.get_line( buffer_ );
254 
255  if( ! buffer_.empty() && buffer_ != sequence.label() ) {
256  throw std::runtime_error(
257  "Malformed Fastq " + it.source_name() + ": Expecting the second label line to either " +
258  "be empty or equal to the first label line at line " + std::to_string( it.line() ) + "."
259  );
260  }
261 }
262 
263 void FastqReader::parse_quality_( utils::InputStream& input_stream, Sequence& sequence ) const
264 {
265  auto& it = input_stream;
266  buffer_.clear();
267 
268  // Check for unexpected end of file.
269  if( !it ) {
270  throw std::runtime_error(
271  "Malformed Fastq " + it.source_name()
272  + ": Expecting quality scores after the second label line at line "
273  + std::to_string( it.line() - 1 ) + "."
274  );
275  }
276  assert( it );
277 
278  // Parse qualities. At every beginning of the loop, we are at a line start.
279  // Continue until we have read as much characters as the sequence is long.
280  while( it && buffer_.size() < sequence.sites().size() ) {
281 
282  // Again, this function is only called internally, and only ever when we are at the
283  // beginning of a new line. Assert this.
284  assert( it.column() == 1 );
285 
286  // The get_line function appends to the buffer.
287  it.get_line( buffer_ );
288  }
289  assert( !it || buffer_.size() >= sequence.sites().size() );
290 
291  if( buffer_.size() != sequence.sites().size() ) {
292  throw std::runtime_error(
293  "Malformed Fastq " + it.source_name()
294  + ": Expecting the quality scores to be of the same length as the sequence at line "
295  + std::to_string( it.line() - 1 ) + "."
296  );
297  }
298 
299  // Run the plugin, if availabe.
300  if( quality_string_plugin_ ) {
301  quality_string_plugin_( buffer_, sequence );
302  }
303 }
304 
305 // =================================================================================================
306 // Properties
307 // =================================================================================================
308 
310 {
311  site_casing_ = value;
312  return *this;
313 }
314 
316 {
317  return site_casing_;
318 }
319 
320 FastqReader& FastqReader::valid_chars( std::string const& chars )
321 {
322  if( chars.size() == 0 ) {
323  lookup_.set_all( true );
324  use_validation_ = false;
325  } else {
326  lookup_.set_all( false );
327  lookup_.set_selection( chars, true );
328  use_validation_ = true;
329  }
330 
331  return *this;
332 }
333 
334 std::string FastqReader::valid_chars() const
335 {
336  // We need to check the valid chars lookup here, because we don't want to return a string
337  // of _all_ chars.
338  if( ! use_validation_ || lookup_.all_equal_to( true ) ) {
339  return "";
340  } else {
341  return lookup_.get_chars_equal_to( true );
342  }
343 }
344 
346 {
347  return lookup_;
348 }
349 
351 {
352  quality_encoding_ = encoding;
353  return *this;
354 }
355 
357 {
358  return quality_encoding_;
359 }
360 
362 {
363  quality_string_plugin_ = plugin;
364  return *this;
365 }
366 
367 
368 } // namespace sequence
369 } // namespace genesis
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
genesis::sequence::Sequence::clear
void clear()
Definition: sequence/sequence.hpp:78
fastq_reader.hpp
genesis::sequence::FastqReader::SiteCasing::kToLower
@ kToLower
Make all sites lower case.
fs.hpp
Provides functions for accessing the file system.
genesis::utils::to_upper_ascii_inplace
void to_upper_ascii_inplace(std::string &str)
Turn the given string to all-uppercase, ASCII-only, inline.
Definition: string.cpp:902
genesis::sequence::FastqReader::parse_label1_
void parse_label1_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the first label line (starting with an @).
Definition: fastq_reader.cpp:136
genesis::sequence::FastqReader::valid_chars
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
Definition: fastq_reader.cpp:334
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
genesis::utils::CharLookup::all_equal_to
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
Definition: char_lookup.hpp:242
genesis::sequence::FastqReader::SiteCasing::kToUpper
@ kToUpper
Make all sites upper case.
genesis::utils::to_lower_ascii_inplace
void to_lower_ascii_inplace(std::string &str)
Turn the given string to all-lowercase, ASCII-only.
Definition: string.cpp:878
sequence_set.hpp
genesis::sequence::FastqReader::read
SequenceSet read(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in Fastq format and return them as a SequenceSet.
Definition: fastq_reader.cpp:66
genesis::sequence::FastqReader
Read Fastq sequence data.
Definition: fastq_reader.hpp:149
std.hpp
Provides some valuable additions to STD.
genesis::sequence::FastqReader::parse_label2_
void parse_label2_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the second label line (starting with a +, and either empty or equal to the first).
Definition: fastq_reader.cpp:234
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
string.hpp
Provides some commonly used string utility functions.
input_stream.hpp
genesis::sequence::Sequence::label
std::string & label()
Definition: sequence/sequence.hpp:90
genesis::sequence::FastqReader::valid_char_lookup
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
Definition: fastq_reader.cpp:345
genesis::sequence::QualityEncoding
QualityEncoding
List of quality encodings for which we support decoding.
Definition: quality.hpp:72
genesis::utils::CharLookup< bool >
genesis::utils::CharLookup::get_chars_equal_to
std::string get_chars_equal_to(T comp_value) const
Return a std::string containg all chars which have lookup status equal to a given value.
Definition: char_lookup.hpp:228
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::FastqReader::parse_sites_
void parse_sites_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the sequence line(s).
Definition: fastq_reader.cpp:173
genesis::sequence::SequenceSet
Store a set of Sequences.
Definition: sequence_set.hpp:53
genesis::sequence::SequenceSet::add
reference add(Sequence const &s)
Add a Sequence to the SequenceSet by copying it, and return a reference to it.
Definition: sequence_set.hpp:133
genesis::sequence::FastqReader::parse_document
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fastq document into a SequenceSet.
Definition: fastq_reader.cpp:86
char.hpp
genesis::sequence::FastqReader::parse_quality_
void parse_quality_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the quality score line(s), which also runs the plugin, if available.
Definition: fastq_reader.cpp:263
genesis::sequence::Sequence::sites
std::string & sites()
Definition: sequence/sequence.hpp:110
genesis::sequence::FastqReader::FastqReader
FastqReader()
Create a default FastqReader.
Definition: fastq_reader.cpp:57
genesis::sequence::FastqReader::SiteCasing
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
Definition: fastq_reader.hpp:177
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
scanner.hpp
genesis::sequence::FastqReader::site_casing
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
Definition: fastq_reader.cpp:315
genesis::sequence::FastqReader::quality_string_plugin
FastqReader & quality_string_plugin(quality_string_function const &plugin)
Functional that can be set to process the quality string found in fastq files.
Definition: fastq_reader.cpp:361
genesis::sequence::FastqReader::parse_sequence_
bool parse_sequence_(utils::InputStream &input_stream, Sequence &sequence) const
Parse a fastq sequence into the given sequence object.
Definition: fastq_reader.cpp:107
genesis::utils::CharLookup::set_all
void set_all(T value)
Set the lookup status for all chars at once.
Definition: char_lookup.hpp:192
genesis::sequence::FastqReader::quality_string_function
std::function< void(std::string const &quality_string, Sequence &sequence) > quality_string_function
Function type that allows to work with the quality line(s) in fastq files.
Definition: fastq_reader.hpp:172
genesis::sequence::FastqReader::parse_sequence
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fastq format.
Definition: fastq_reader.cpp:96
genesis::sequence::FastqReader::quality_encoding
QualityEncoding quality_encoding()
Return the currently set QualityEncoding that is used for decoding the quality score line of the Fast...
Definition: fastq_reader.cpp:356
genesis::utils::CharLookup::set_selection
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
Definition: char_lookup.hpp:157
sequence.hpp