|
A library for working with phylogenetic and population genetic data.
v0.32.0
|
|
Go to the documentation of this file.
75 std::shared_ptr< utils::BaseInputSource > source,
92 sequence_set.
add( tmp_seq );
116 if( !input_stream ) {
138 auto& it = input_stream;
142 if( !it || *it !=
'@' ) {
143 throw std::runtime_error(
144 "Malformed Fastq " + it.source_name()
145 +
": Expecting '@' at beginning of sequence at line "
149 assert( it && *it ==
'@' );
153 it.get_line( buffer_ );
154 auto const buffer_is_print = std::all_of(
158 return utils::is_print( c );
161 if( buffer_.empty() || !buffer_is_print ) {
162 throw std::runtime_error(
163 "Malformed Fastq " + it.source_name() +
": Expecting valid label after '@' "
164 "in sequence at line " +
std::to_string( it.line() ) +
", but instead the label "
165 "is empty or contains non-printable characters."
170 sequence.
label( buffer_ );
176 auto& it = input_stream;
181 throw std::runtime_error(
182 "Malformed Fastq " + it.source_name()
183 +
": Expecting a sequence sites line after the first label line at line "
192 while( it && *it !=
'+' ) {
196 assert( it.column() == 1 );
199 it.get_line( buffer_ );
201 assert( !it || *it ==
'+' );
203 if( buffer_.length() == 0 ) {
204 throw std::runtime_error(
205 "Malformed Fastq " + it.source_name() +
": Empty sequence at line "
218 if( use_validation_ ) {
219 for(
auto const& c : buffer_ ) {
221 throw std::runtime_error(
222 "Malformed Fastq " + it.source_name() +
": Invalid sequence symbol "
224 +
" in sequence near line " +
std::to_string( it.line() - 1 ) +
"."
231 sequence.
sites( buffer_ );
236 auto& it = input_stream;
240 if( !it || *it !=
'+' ) {
241 throw std::runtime_error(
242 "Malformed Fastq " + it.source_name()
243 +
": Expecting '+' at beginning of sequence at line "
247 assert( it && *it ==
'+' );
253 it.get_line( buffer_ );
255 if( ! buffer_.empty() && buffer_ != sequence.
label() ) {
256 throw std::runtime_error(
257 "Malformed Fastq " + it.source_name() +
": Expecting the second label line to either " +
258 "be empty or equal to the first label line at line " +
std::to_string( it.line() ) +
"."
265 auto& it = input_stream;
270 throw std::runtime_error(
271 "Malformed Fastq " + it.source_name()
272 +
": Expecting quality scores after the second label line at line "
280 while( it && buffer_.size() < sequence.
sites().size() ) {
284 assert( it.column() == 1 );
287 it.get_line( buffer_ );
289 assert( !it || buffer_.size() >= sequence.
sites().size() );
291 if( buffer_.size() != sequence.
sites().size() ) {
292 throw std::runtime_error(
293 "Malformed Fastq " + it.source_name()
294 +
": Expecting the quality scores to be of the same length as the sequence at line "
300 if( quality_string_plugin_ ) {
301 quality_string_plugin_( buffer_, sequence );
311 site_casing_ = value;
322 if( chars.size() == 0 ) {
324 use_validation_ =
false;
328 use_validation_ =
true;
338 if( ! use_validation_ || lookup_.
all_equal_to(
true ) ) {
352 quality_encoding_ = encoding;
358 return quality_encoding_;
363 quality_string_plugin_ = plugin;
@ kToLower
Make all sites lower case.
Provides functions for accessing the file system.
void to_upper_ascii_inplace(std::string &str)
Turn the given string to all-uppercase, ASCII-only, in place.
void parse_label1_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the first label line (starting with an @).
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
@ kToUpper
Make all sites upper case.
void to_lower_ascii_inplace(std::string &str)
Turn the given string to all-lowercase, ASCII-only, in place.
SequenceSet read(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in Fastq format and return them as a SequenceSet.
Read Fastq sequence data.
Provides some valuable additions to STD.
void parse_label2_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the second label line (starting with a +, and either empty or equal to the first).
std::string to_string(GenomeLocus const &locus)
Provides some commonly used string utility functions.
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
QualityEncoding
List of quality encodings for which we support decoding.
std::string get_chars_equal_to(T comp_value) const
Return a std::string containing all chars which have lookup status equal to a given value.
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
void parse_sites_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the sequence line(s).
Store a set of Sequences.
reference add(Sequence const &s)
Add a Sequence to the SequenceSet by copying it, and return a reference to it.
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fastq document into a SequenceSet.
void parse_quality_(utils::InputStream &input_stream, Sequence &sequence) const
Parse the quality score line(s), which also runs the plugin, if available.
FastqReader()
Create a default FastqReader.
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
FastqReader & quality_string_plugin(quality_string_function const &plugin)
Functional that can be set to process the quality string found in fastq files.
bool parse_sequence_(utils::InputStream &input_stream, Sequence &sequence) const
Parse a fastq sequence into the given sequence object.
void set_all(T value)
Set the lookup status for all chars at once.
std::function< void(std::string const &quality_string, Sequence &sequence) > quality_string_function
Function type that allows working with the quality line(s) in fastq files.
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fastq format.
QualityEncoding quality_encoding()
Return the currently set QualityEncoding that is used for decoding the quality score line of the Fastq file.
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.