74 std::shared_ptr< utils::BaseInputSource > source,
93 sequence_set.
add( seq );
98 sequence_set.
add( seq );
114 auto& it = input_stream;
131 if( !it || *it !=
'>' ) {
132 throw std::runtime_error(
133 "Malformed Fasta " + it.source_name()
134 +
": Expecting '>' at beginning of sequence at line " +
std::to_string( it.line() ) +
"." 137 assert( it && *it ==
'>' );
143 throw std::runtime_error(
144 "Malformed Fasta " + it.source_name()
145 +
": Expecting label after '>' in sequence at line " +
std::to_string( it.line() ) +
"." 148 if( guess_abundances_ ) {
150 sequence.
label( la.first );
153 sequence.
label( label );
157 if( !it || *it !=
'\n' ) {
158 throw std::runtime_error(
159 "Malformed Fasta " + it.source_name()
160 +
": Unexpected characters at the end of the label line in sequence at line " 164 assert( it && *it ==
'\n' );
174 while( it && *it ==
';' ) {
176 assert( it && *it ==
'\n' );
182 throw std::runtime_error(
183 "Malformed Fasta " + it.source_name()
184 +
": Expecting a sequence after the label line in sequence at line " 197 while( it && *it !=
'>' ) {
198 assert( it.column() == 1 );
199 it.get_line( sites );
201 assert( !it || *it ==
'>' );
203 if( sites.length() == 0 ) {
204 throw std::runtime_error(
205 "Malformed Fasta " + it.source_name() +
": Empty sequence at line " 218 sequence.
sites() = sites;
221 if( use_validation_ ) {
222 for(
auto const& c : sequence.
sites() ) {
224 throw std::runtime_error(
225 "Malformed Fasta " + it.source_name() +
": Invalid sequence symbol " 227 +
" in the sequence at/above line " +
std::to_string( it.line() - 1 ) +
"." 243 auto& it = input_stream;
252 if( it.current() !=
'>' ) {
253 throw std::runtime_error(
254 "Malformed Fasta " + it.source_name()
255 +
": Expecting '>' at beginning of sequence at " + it.at() +
"." 258 assert( it && *it ==
'>' );
264 throw std::runtime_error(
265 "Malformed Fasta " + it.source_name()
266 +
": Expecting label after '>' at " + it.at() +
"." 269 if( guess_abundances_ ) {
271 sequence.
label( la.first );
274 sequence.
label( label );
278 if( !it || ( *it !=
'\n' )) {
279 throw std::runtime_error(
280 "Malformed Fasta " + it.source_name()
281 +
": Expecting a sequence after the label line at " + it.at() +
"." 284 assert( it && (*it ==
'\n' ));
287 if( !it || *it !=
'\n' ) {
288 throw std::runtime_error(
289 "Malformed Fasta " + it.source_name()
290 +
": Expecting a sequence after the label line at " + it.at() +
"." 293 assert( it && *it ==
'\n' );
296 while( it && *it ==
';' ) {
301 if( !it || *it !=
'\n' ) {
302 throw std::runtime_error(
303 "Malformed Fasta " + it.source_name()
304 +
": Expecting a sequence after the label line at " + it.at() +
"." 307 assert( it && *it ==
'\n' );
312 while( it && *it !=
'>' ) {
313 assert( it.column() == 1 );
316 while( it && *it !=
'\n' ) {
322 c =
static_cast<char>( std::toupper( static_cast<unsigned char>( c )));
324 c =
static_cast<char>( std::tolower( static_cast<unsigned char>( c )));
326 if( use_validation_ && ! lookup_[c] ) {
327 throw std::runtime_error(
328 "Malformed Fasta " + it.source_name() +
": Invalid sequence symbol " 339 throw std::runtime_error(
340 "Malformed Fasta " + it.source_name()
341 +
": Empty sequence line at " + it.at() +
"." 346 throw std::runtime_error(
347 "Malformed Fasta " + it.source_name()
348 +
": Sequence line does not end with '\\n' at " + it.at() +
"." 351 assert( it && *it ==
'\n' );
354 assert( !it || *it ==
'>' );
356 if( sites.length() == 0 ) {
357 throw std::runtime_error(
358 "Malformed Fasta " + it.source_name()
359 +
": Empty sequence at " + it.at() +
"." 364 sequence.
sites() = sites;
375 parsing_method_ = value;
381 return parsing_method_;
386 site_casing_ = value;
397 guess_abundances_ = value;
403 return guess_abundances_;
408 if( chars.size() == 0 ) {
410 use_validation_ =
false;
414 use_validation_ =
true;
424 if( ! use_validation_ || lookup_.
all_equal_to(
true ) ) {
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one...
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
void set_all(T value)
Set the lookup status for all chars at once.
Read Fasta sequence data.
ParsingMethod
Enumeration of the available methods for parsing Fasta sequences.
Make all sites lower case.
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fasta document into a SequenceSet.
std::string get_chars_equal_to(T comp_value) const
Return a std::string containing all chars which have lookup status equal to a given value...
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
std::string to_upper_ascii(std::string const &str)
Return an all-uppercase copy of the given string, ASCII-only.
bool parse_sequence_pedantic(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
Provides some valuable additions to STD.
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one...
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
Provides some commonly used string utility functions.
bool guess_abundances() const
Return whether the label is used to guess/extract Sequence abundances.
Provides functions for accessing the file system.
Store a set of Sequences.
std::pair< std::string, size_t > guess_sequence_abundance(Sequence const &sequence)
Guess the abundance of a Sequence, using its label.
reference add(Sequence const &s)
Add a Sequence to the SequenceSet by copying it, and return a reference to it.
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
SequenceSet read(std::shared_ptr< utils::BaseInputSource > source) const
Read all Sequences from an input source in Fasta format and return them as a SequenceSet.
ParsingMethod parsing_method() const
Return the currently set parsing method.
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
Fast method, used by default.
FastaReader()
Create a default FastaReader. By default, chars are converted to upper case, but not validated.
void skip_until(InputStream &source, char criterion)
Lexing function that advances the stream until its current char equals the provided one...
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
Make all sites upper case.
std::string to_lower_ascii(std::string const &str)
Return an all-lowercase copy of the given string, ASCII-only.