A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
fasta_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
42 
43 #include <assert.h>
44 #include <cctype>
45 #include <fstream>
46 #include <sstream>
47 #include <stdexcept>
48 
49 namespace genesis {
50 namespace sequence {
51 
52 // =================================================================================================
53 // Constructor and Rule of Five
54 // =================================================================================================
55 
57 {
58  lookup_.set_all( true );
59 }
60 
61 // =================================================================================================
62 // Reading
63 // =================================================================================================
64 
65 void FastaReader::from_stream ( std::istream& input_stream, SequenceSet& sequence_set ) const
66 {
67  // Create an input stream and process it.
68  utils::InputStream it( utils::make_unique< utils::StreamInputSource >( input_stream ));
69  parse_document( it, sequence_set );
70 }
71 
72 SequenceSet FastaReader::from_stream( std::istream& input_stream ) const
73 {
74  // Create a new set and fill it.
75  SequenceSet result;
76  from_stream( input_stream, result );
77  return result;
78 }
79 
80 void FastaReader::from_file ( std::string const& file_name, SequenceSet& sequence_set ) const
81 {
82  // Create an input stream and process it.
83  utils::InputStream it( utils::make_unique< utils::FileInputSource >( file_name ));
84  parse_document( it, sequence_set );
85 }
86 
87 SequenceSet FastaReader::from_file( std::string const& file_name ) const
88 {
89  // Create a new set and fill it.
90  SequenceSet result;
91  from_file( file_name, result );
92  return result;
93 }
94 
95 void FastaReader::from_string ( std::string const& input_string, SequenceSet& sequence_set ) const
96 {
97  // Create an input stream and process it.
98  utils::InputStream it( utils::make_unique< utils::StringInputSource >( input_string ));
99  parse_document( it, sequence_set );
100 }
101 
102 SequenceSet FastaReader::from_string( std::string const& input_string ) const
103 {
104  // Create a new set and fill it.
105  SequenceSet result;
106  from_string( input_string, result );
107  return result;
108 }
109 
110 // =================================================================================================
111 // Parsing
112 // =================================================================================================
113 
115  utils::InputStream& input_stream,
116  SequenceSet& sequence_set
117 ) const {
118  Sequence seq;
119 
120  if( parsing_method_ == ParsingMethod::kDefault ) {
121  while( parse_sequence( input_stream, seq ) ) {
122  sequence_set.add( seq );
123  }
124 
125  } else if( parsing_method_ == ParsingMethod::kPedantic ) {
126  while( parse_sequence_pedantic( input_stream, seq ) ) {
127  sequence_set.add( seq );
128  }
129 
130  } else {
131  // There are no other methods currently implemented.
132  assert( false );
133  }
134 }
135 
137  utils::InputStream& input_stream,
138  Sequence& sequence
139 ) const {
140  // Init. Call clear in order to avoid not setting properties that might be added to
141  // Sequence in the future. Should not noticeable affect speed, as the sequence string capacities
142  // should not change when setting the strings to empty strings.
143  auto& it = input_stream;
144  sequence.clear();
145 
146  // Check for data.
147  if( !it ) {
148  return false;
149  }
150 
151  // Scope to ensure that the label line is only used
152  // while we actually are in that line.
153  {
154 
155  // Get the label line.
156  auto line_pair = it.get_line();
157  char* line = line_pair.first;
158 
159  // Check beginning of sequence.
160  if( line == nullptr || *line != '>' ) {
161  throw std::runtime_error(
162  "Malformed Fasta " + it.source_name()
163  + ": Expecting '>' at beginning of sequence at line " + std::to_string( it.line() - 1 ) + "."
164  );
165  }
166  assert( line && *line == '>' );
167  ++line;
168 
169  // Parse label.
170  std::string label = utils::read_while( line, isprint );
171  if( label == "" ) {
172  throw std::runtime_error(
173  "Malformed Fasta " + it.source_name()
174  + ": Expecting label after '>' in sequence at line " + std::to_string( it.line() - 1 ) + "."
175  );
176  }
177  if( guess_abundances_ ) {
178  auto const la = guess_sequence_abundance( label );
179  sequence.label( la.first );
180  sequence.abundance( la.second );
181  } else {
182  sequence.label( label );
183  }
184 
185  // Check for unexpected end of file.
186  if( *line != '\0' ) {
187  throw std::runtime_error(
188  "Malformed Fasta " + it.source_name()
189  + ": Unexpected characters at the end of the label line in sequence at line "
190  + std::to_string( it.line() - 1 ) + "."
191  );
192  }
193  assert( *line == '\0' );
194 
195  } // End of line scope. We are done with the label line.
196 
197  // Skip comments.
198  while( it && *it == ';' ) {
199  utils::skip_until( it, '\n' );
200  assert( it && *it == '\n' );
201  ++it;
202  }
203 
204  // Check for unexpected end of file.
205  if( !it ) {
206  throw std::runtime_error(
207  "Malformed Fasta " + it.source_name()
208  + ": Expecting a sequence after the label line in sequence at line "
209  + std::to_string( it.line() - 1 ) + "."
210  );
211  }
212  assert( it );
213 
214  // Reserve some tmp memory. We will later copy the content, so that superfluous capacity
215  // is stripped.
216  // We could do a sites.reserve( ... ) here, but this yields only minor speedups.
217  std::string sites;
218  // sites.reserve( n );
219 
220  // Parse sequence. At every beginning of the loop, we are at a line start.
221  while( it && *it != '>' ) {
222  assert( it.column() == 1 );
223 
224  auto line_pair = it.get_line();
225  if( line_pair.second > 0 ) {
226  sites += std::string( line_pair.first, line_pair.second );
227  }
228  }
229  assert( !it || *it == '>' );
230 
231  if( sites.length() == 0 ) {
232  throw std::runtime_error(
233  "Malformed Fasta " + it.source_name() + ": Empty sequence at line "
234  + std::to_string( it.line() - 1 ) + "."
235  );
236  }
237 
238  if( site_casing_ == SiteCasing::kToUpper ) {
239  sequence.sites() = utils::to_upper_ascii( sites );
240  } else if( site_casing_ == SiteCasing::kToLower ) {
241  sequence.sites() = utils::to_lower_ascii( sites );
242  } else {
243  // We could do a move here instead, but this way, we save some memory, which might be
244  // more reasonable for big sequences files than the small gain in speed.
245  // sequence.sites() = std::move(sites);
246  sequence.sites() = sites;
247  }
248 
249  if( use_validation_ ) {
250  for( auto const& c : sequence.sites() ) {
251  if( !lookup_[c] ) {
252  throw std::runtime_error(
253  "Malformed Fasta " + it.source_name() + ": Invalid sequence symbol "
254  + utils::char_to_hex( c, true )
255  + " in sequence near line " + std::to_string( it.line() - 1 ) + "."
256  );
257  }
258  }
259  }
260 
261  return true;
262 }
263 
265  utils::InputStream& input_stream,
266  Sequence& sequence
267 ) const {
268  // Init. Call clear in order to avoid not setting properties that might be added to
269  // Sequence in the future. Should not noticeable affect speed, as the sequence string capacities
270  // should not change when setting the strings to empty strings.
271  auto& it = input_stream;
272  sequence.clear();
273 
274  // Check for data.
275  if( !it ) {
276  return false;
277  }
278 
279  // Check beginning of sequence.
280  if( it.current() != '>' ) {
281  throw std::runtime_error(
282  "Malformed Fasta " + it.source_name()
283  + ": Expecting '>' at beginning of sequence at " + it.at() + "."
284  );
285  }
286  assert( it && *it == '>' );
287  ++it;
288 
289  // Parse label.
290  std::string label = utils::read_while( it, isprint );
291  if( label == "" ) {
292  throw std::runtime_error(
293  "Malformed Fasta " + it.source_name()
294  + ": Expecting label after '>' at " + it.at() + "."
295  );
296  }
297  if( guess_abundances_ ) {
298  auto const la = guess_sequence_abundance( label );
299  sequence.label( la.first );
300  sequence.abundance( la.second );
301  } else {
302  sequence.label( label );
303  }
304 
305  // Check for unexpected end of stream.
306  if( !it || ( *it != '\n' )) {
307  throw std::runtime_error(
308  "Malformed Fasta " + it.source_name()
309  + ": Expecting a sequence after the label line at " + it.at() + "."
310  );
311  }
312  assert( it && (*it == '\n' ));
313 
314  // Check for unexpected end of file.
315  if( !it || *it != '\n' ) {
316  throw std::runtime_error(
317  "Malformed Fasta " + it.source_name()
318  + ": Expecting a sequence after the label line at " + it.at() + "."
319  );
320  }
321  assert( it && *it == '\n' );
322 
323  // Skip comments.
324  while( it && *it == ';' ) {
325  utils::skip_while( it, isprint );
326  }
327 
328  // Check for unexpected end of file.
329  if( !it || *it != '\n' ) {
330  throw std::runtime_error(
331  "Malformed Fasta " + it.source_name()
332  + ": Expecting a sequence after the label line at " + it.at() + "."
333  );
334  }
335  assert( it && *it == '\n' );
336  ++it;
337 
338  // Parse sequence. At every beginning of the outer loop, we are at a line start.
339  std::string sites;
340  while( it && *it != '>' ) {
341  assert( it.column() == 1 );
342 
343  size_t count = 0;
344  while( it && *it != '\n' ) {
345  char c = *it;
346  if( site_casing_ == SiteCasing::kToUpper ) {
347  c = toupper(c);
348  } else if( site_casing_ == SiteCasing::kToLower ) {
349  c = tolower(c);
350  }
351  if( use_validation_ && ! lookup_[c] ) {
352  throw std::runtime_error(
353  "Malformed Fasta " + it.source_name() + ": Invalid sequence symbol "
354  + utils::char_to_hex( c, true ) + " in sequence at " + it.at() + "."
355  );
356  }
357 
358  sites += c;
359  ++it;
360  ++count;
361  }
362 
363  if( count == 0 ) {
364  throw std::runtime_error(
365  "Malformed Fasta " + it.source_name()
366  + ": Empty sequence line at " + it.at() + "."
367  );
368  }
369 
370  if( !it ) {
371  throw std::runtime_error(
372  "Malformed Fasta " + it.source_name()
373  + ": Sequence line does not end with '\\n' at " + it.at() + "."
374  );
375  }
376  assert( it && *it == '\n' );
377  ++it;
378  }
379  assert( !it || *it == '>' );
380 
381  if( sites.length() == 0 ) {
382  throw std::runtime_error(
383  "Malformed Fasta " + it.source_name()
384  + ": Empty sequence at " + it.at() + "."
385  );
386  }
387 
388  // Copy the sequence. We do not use move here, as we can save some memory this way.
389  sequence.sites() = sites;
390 
391  return true;
392 }
393 
394 // =================================================================================================
395 // Properties
396 // =================================================================================================
397 
399 {
400  parsing_method_ = value;
401  return *this;
402 }
403 
405 {
406  return parsing_method_;
407 }
408 
410 {
411  site_casing_ = value;
412  return *this;
413 }
414 
416 {
417  return site_casing_;
418 }
419 
421 {
422  guess_abundances_ = value;
423  return *this;
424 }
425 
427 {
428  return guess_abundances_;
429 }
430 
431 FastaReader& FastaReader::valid_chars( std::string const& chars )
432 {
433  if( chars.size() == 0 ) {
434  lookup_.set_all( true );
435  use_validation_ = false;
436  } else {
437  lookup_.set_all( false );
438  lookup_.set_selection( chars, true );
439  use_validation_ = true;
440  }
441 
442  return *this;
443 }
444 
445 std::string FastaReader::valid_chars() const
446 {
447  // We need to check the valid chars lookup here, because we don't want to return a string
448  // of _all_ chars.
449  if( ! use_validation_ || lookup_.all_equal_to( true ) ) {
450  return "";
451  } else {
452  return lookup_.get_chars_equal_to( true );
453  }
454 }
455 
457 {
458  return lookup_;
459 }
460 
461 } // namespace sequence
462 } // namespace genesis
size_t abundance() const
Definition: sequence.cpp:82
char to_lower_ascii(char c)
Return the lower case of a given char, ascii-only.
Definition: char.hpp:81
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one...
Definition: scanner.hpp:151
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
void set_all(T value)
Set the lookup status for all chars at once.
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
Read Fasta sequence data.
std::pair< std::string, size_t > guess_sequence_abundance(Sequence const &sequence)
Guess the abundance of a Sequence, using it's label.
Definition: labels.cpp:70
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
void from_string(std::string const &input_string, SequenceSet &sequence_set) const
Read all Sequences from a std::string in Fasta format into a SequenceSet.
ParsingMethod
Enumeration of the available methods for parsing Fasta sequences.
std::string const & sites() const
Definition: sequence.cpp:58
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
bool parse_sequence_pedantic(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:381
Provides some valuable additions to STD.
std::string const & label() const
Definition: sequence.cpp:44
std::string char_to_hex(char c, bool full=false)
Return the hex representation of a char.
Definition: char.hpp:104
bool guess_abundances() const
Return whether the label is used to guess/extracat Sequence abundances.
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one...
Definition: scanner.hpp:214
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
std::string get_chars_equal_to(T comp_value) const
Return a std::string containg all chars which have lookup status equal to a given value...
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fasta document into a SequenceSet.
Provides some commonly used string utility functions.
void from_stream(std::istream &input_stream, SequenceSet &sequence_set) const
Read all Sequences from a std::istream in Fasta format into a SequenceSet.
Provides functions for accessing the file system.
Store a set of Sequences.
char to_upper_ascii(char c)
Return the upper case of a given char, ascii-only.
Definition: char.hpp:89
ParsingMethod parsing_method() const
Return the currently set parsing method.
reference add(Sequence const &s)
Add a Sequence to the SequenceSet by copying it, and return a reference to it.
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
FastaReader()
Create a default FastaReader. Per default, chars are turned upper case, but not validated.
void skip_until(InputStream &source, char criterion)
Lexing function that advances the stream until its current char equals the provided one...
Definition: scanner.hpp:182
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
void from_file(std::string const &file_name, SequenceSet &sequence_set) const
Read all Sequences from a file in Fasta format into a SequenceSet.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...