|
A library for working with phylogenetic and population genetic data.
v0.27.0
|
|
Go to the documentation of this file.
74 std::shared_ptr< utils::BaseInputSource > source,
93 sequence_set.
add( seq );
98 sequence_set.
add( seq );
114 auto& it = input_stream;
131 if( !it || *it !=
'>' ) {
132 throw std::runtime_error(
133 "Malformed Fasta " + it.source_name()
134 +
": Expecting '>' at beginning of sequence at line " +
std::to_string( it.line() ) +
"."
137 assert( it && *it ==
'>' );
143 throw std::runtime_error(
144 "Malformed Fasta " + it.source_name()
145 +
": Expecting label after '>' in sequence at line " +
std::to_string( it.line() ) +
"."
148 if( guess_abundances_ ) {
150 sequence.
label( la.first );
153 sequence.
label( label );
157 if( !it || *it !=
'\n' ) {
158 throw std::runtime_error(
159 "Malformed Fasta " + it.source_name()
160 +
": Unexpected characters at the end of the label line in sequence at line "
164 assert( it && *it ==
'\n' );
174 while( it && *it ==
';' ) {
176 assert( it && *it ==
'\n' );
182 throw std::runtime_error(
183 "Malformed Fasta " + it.source_name()
184 +
": Expecting a sequence after the label line in sequence at line "
197 while( it && *it !=
'>' ) {
198 assert( it.column() == 1 );
199 it.get_line( sites );
201 assert( !it || *it ==
'>' );
203 if( sites.length() == 0 ) {
204 throw std::runtime_error(
205 "Malformed Fasta " + it.source_name() +
": Empty sequence at line "
218 sequence.
sites() = sites;
221 if( use_validation_ ) {
222 for(
auto const& c : sequence.
sites() ) {
224 throw std::runtime_error(
225 "Malformed Fasta " + it.source_name() +
": Invalid sequence symbol "
227 +
" in the sequence at/above line " +
std::to_string( it.line() - 1 ) +
"."
243 auto& it = input_stream;
252 if( it.current() !=
'>' ) {
253 throw std::runtime_error(
254 "Malformed Fasta " + it.source_name()
255 +
": Expecting '>' at beginning of sequence at " + it.at() +
"."
258 assert( it && *it ==
'>' );
264 throw std::runtime_error(
265 "Malformed Fasta " + it.source_name()
266 +
": Expecting label after '>' at " + it.at() +
"."
269 if( guess_abundances_ ) {
271 sequence.
label( la.first );
274 sequence.
label( label );
278 if( !it || ( *it !=
'\n' )) {
279 throw std::runtime_error(
280 "Malformed Fasta " + it.source_name()
281 +
": Expecting a sequence after the label line at " + it.at() +
"."
284 assert( it && (*it ==
'\n' ));
287 if( !it || *it !=
'\n' ) {
288 throw std::runtime_error(
289 "Malformed Fasta " + it.source_name()
290 +
": Expecting a sequence after the label line at " + it.at() +
"."
293 assert( it && *it ==
'\n' );
296 while( it && *it ==
';' ) {
301 if( !it || *it !=
'\n' ) {
302 throw std::runtime_error(
303 "Malformed Fasta " + it.source_name()
304 +
": Expecting a sequence after the label line at " + it.at() +
"."
307 assert( it && *it ==
'\n' );
312 while( it && *it !=
'>' ) {
313 assert( it.column() == 1 );
316 while( it && *it !=
'\n' ) {
322 c =
static_cast<char>( std::toupper(
static_cast<unsigned char>( c )));
324 c =
static_cast<char>( std::tolower(
static_cast<unsigned char>( c )));
326 if( use_validation_ && ! lookup_[c] ) {
327 throw std::runtime_error(
328 "Malformed Fasta " + it.source_name() +
": Invalid sequence symbol "
339 throw std::runtime_error(
340 "Malformed Fasta " + it.source_name()
341 +
": Empty sequence line at " + it.at() +
"."
346 throw std::runtime_error(
347 "Malformed Fasta " + it.source_name()
348 +
": Sequence line does not end with '\\n' at " + it.at() +
"."
351 assert( it && *it ==
'\n' );
354 assert( !it || *it ==
'>' );
356 if( sites.length() == 0 ) {
357 throw std::runtime_error(
358 "Malformed Fasta " + it.source_name()
359 +
": Empty sequence at " + it.at() +
"."
364 sequence.
sites() = sites;
375 parsing_method_ = value;
381 return parsing_method_;
386 site_casing_ = value;
397 guess_abundances_ = value;
403 return guess_abundances_;
408 if( chars.size() == 0 ) {
410 use_validation_ =
false;
414 use_validation_ =
true;
424 if( ! use_validation_ || lookup_.
all_equal_to(
true ) ) {
@ kToLower
Make all sites lower case.
@ kPedantic
Pedantic method.
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one....
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
Provides functions for accessing the file system.
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
Read Fasta sequence data.
bool parse_sequence_pedantic(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
Provides some valuable additions to STD.
std::string to_string(GenomeLocus const &locus)
Provides some commonly used string utility functions.
std::string to_upper_ascii(std::string const &str)
Return an all-uppercase copy of the given string, ASCII-only.
bool guess_abundances() const
Return whether the label is used to guess/extract Sequence abundances.
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one.
@ kDefault
Fast method, used by default.
FastaReader()
Create a default FastaReader. Per default, chars are turned upper case, but not validated.
std::string get_chars_equal_to(T comp_value) const
Return a std::string containing all chars which have lookup status equal to a given value.
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Store a set of Sequences.
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
reference add(Sequence const &s)
Add a Sequence to the SequenceSet by copying it, and return a reference to it.
void skip_until(InputStream &source, char criterion)
Lexing function that advances the stream until its current char equals the provided one.
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
std::string to_lower_ascii(std::string const &str)
Return an all-lowercase copy of the given string, ASCII-only.
SequenceSet read(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in Fasta format and return them as a SequenceSet.
ParsingMethod parsing_method() const
Return the currently set parsing method.
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
void set_all(T value)
Set the lookup status for all chars at once.
std::pair< std::string, size_t > guess_sequence_abundance(Sequence const &sequence)
Guess the abundance of a Sequence, using its label.
@ kToUpper
Make all sites upper case.
ParsingMethod
Enumeration of the available methods for parsing Fasta sequences.
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fasta document into a SequenceSet.