|
A library for working with phylogenetic and population genetic data.
v0.32.0
|
|
Go to the documentation of this file.
68 parse_document_( input_stream, result );
73 std::shared_ptr< utils::BaseInputSource > source,
77 parse_document_( input_stream, sequence_set );
84 parse_document_( input_stream, result );
89 std::shared_ptr<utils::BaseInputSource> source,
90 bool also_look_up_first_word
94 parse_document_( input_stream, result, also_look_up_first_word );
106 parse_document_( input_stream, sequence_set );
116 auto& it = input_stream;
129 if( !it || *it !=
'>' ) {
130 throw std::runtime_error(
131 "Malformed Fasta " + it.source_name()
132 +
": Expecting '>' at beginning of sequence at line " +
std::to_string( it.line() ) +
"."
135 assert( it && *it ==
'>' );
140 it.get_line( buffer_ );
141 auto const buffer_is_print = std::all_of(
145 return utils::is_print( c );
148 if( buffer_.empty() || !buffer_is_print ) {
149 throw std::runtime_error(
150 "Malformed Fasta " + it.source_name() +
": Expecting valid label after '>' "
151 "in sequence at line " +
std::to_string( it.line() ) +
", but instead the label "
152 "is empty or contains non-printable characters."
155 if( guess_abundances_ ) {
157 sequence.
label( la.first );
160 sequence.
label( buffer_ );
168 while( it && *it ==
';' ) {
170 assert( it && *it ==
'\n' );
176 throw std::runtime_error(
177 "Malformed Fasta " + it.source_name()
178 +
": Expecting a sequence after the label line in sequence at line "
190 while( it && *it !=
'>' ) {
191 assert( it.column() == 1 );
192 it.get_line( buffer_ );
194 assert( !it || *it ==
'>' );
196 if( buffer_.length() == 0 ) {
197 throw std::runtime_error(
198 "Malformed Fasta " + it.source_name() +
": Empty sequence at line "
211 sequence.
sites() = buffer_;
214 if( use_validation_ ) {
215 for(
auto const& c : sequence.
sites() ) {
217 throw std::runtime_error(
218 "Malformed Fasta " + it.source_name() +
": Invalid sequence symbol "
220 +
" in the sequence at/above line " +
std::to_string( it.line() - 1 ) +
"."
236 auto& it = input_stream;
245 if( it.current() !=
'>' ) {
246 throw std::runtime_error(
247 "Malformed Fasta " + it.source_name()
248 +
": Expecting '>' at beginning of sequence at " + it.at() +
"."
251 assert( it && *it ==
'>' );
257 throw std::runtime_error(
258 "Malformed Fasta " + it.source_name()
259 +
": Expecting label after '>' at " + it.at() +
"."
262 if( guess_abundances_ ) {
264 sequence.
label( la.first );
267 sequence.
label( label );
271 if( !it || ( *it !=
'\n' )) {
272 throw std::runtime_error(
273 "Malformed Fasta " + it.source_name()
274 +
": Expecting a sequence after the label line at " + it.at() +
"."
277 assert( it && (*it ==
'\n' ));
280 if( !it || *it !=
'\n' ) {
281 throw std::runtime_error(
282 "Malformed Fasta " + it.source_name()
283 +
": Expecting a sequence after the label line at " + it.at() +
"."
286 assert( it && *it ==
'\n' );
289 while( it && *it ==
';' ) {
294 if( !it || *it !=
'\n' ) {
295 throw std::runtime_error(
296 "Malformed Fasta " + it.source_name()
297 +
": Expecting a sequence after the label line at " + it.at() +
"."
300 assert( it && *it ==
'\n' );
305 while( it && *it !=
'>' ) {
306 assert( it.column() == 1 );
309 while( it && *it !=
'\n' ) {
315 c =
static_cast<char>( std::toupper(
static_cast<unsigned char>( c )));
317 c =
static_cast<char>( std::tolower(
static_cast<unsigned char>( c )));
319 if( use_validation_ && ! lookup_[c] ) {
320 throw std::runtime_error(
321 "Malformed Fasta " + it.source_name() +
": Invalid sequence symbol "
332 throw std::runtime_error(
333 "Malformed Fasta " + it.source_name()
334 +
": Empty sequence line at " + it.at() +
"."
339 throw std::runtime_error(
340 "Malformed Fasta " + it.source_name()
341 +
": Sequence line does not end with '\\n' at " + it.at() +
"."
344 assert( it && *it ==
'\n' );
347 assert( !it || *it ==
'>' );
349 if( sites.length() == 0 ) {
350 throw std::runtime_error(
351 "Malformed Fasta " + it.source_name()
352 +
": Empty sequence at " + it.at() +
"."
357 sequence.
sites() = sites;
368 parsing_method_ = value;
374 return parsing_method_;
379 site_casing_ = value;
390 guess_abundances_ = value;
396 return guess_abundances_;
401 if( chars.size() == 0 ) {
403 use_validation_ =
false;
407 use_validation_ =
true;
417 if( ! use_validation_ || lookup_.
all_equal_to(
true ) ) {
@ kToLower
Make all sites lower case.
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
Provides functions for accessing the file system.
Store dictionary/index data on sequence files, such as coming from .fai or .dict files.
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
ReferenceGenome read_reference_genome(std::shared_ptr< utils::BaseInputSource > source, bool also_look_up_first_word=true) const
Read all Sequences from an input source in fasta format into a ReferenceGenome.
Read Fasta sequence data.
SequenceDict read_dict(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in fasta format, but only return their names and lengths as a...
bool parse_sequence_pedantic(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
Provides some valuable additions to STD.
std::string to_string(GenomeLocus const &locus)
Provides some commonly used string utility functions.
std::string to_upper_ascii(std::string const &str)
Return an all-uppercase copy of the given string, ASCII-only.
bool guess_abundances() const
Return whether the label is used to guess/extract Sequence abundances.
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one.
FastaReader()
Create a default FastaReader. Per default, chars are turned upper case, but not validated.
std::string get_chars_equal_to(T comp_value) const
Return a std::string containing all chars which have lookup status equal to a given value.
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Store a set of Sequences.
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
void skip_until(InputStream &source, char criterion)
Lexing function that advances the stream until its current char equals the provided one.
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
std::string to_lower_ascii(std::string const &str)
Return an all-lowercase copy of the given string, ASCII-only.
SequenceSet read(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in Fasta format and return them as a SequenceSet.
ParsingMethod parsing_method() const
Return the currently set parsing method.
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
void set_all(T value)
Set the lookup status for all chars at once.
std::pair< std::string, size_t > guess_sequence_abundance(Sequence const &sequence)
Guess the abundance of a Sequence, using its label.
Lookup of Sequences of a reference genome.
@ kToUpper
Make all sites upper case.
ParsingMethod
Enumeration of the available methods for parsing Fasta sequences.
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fasta document into a SequenceSet.