A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
fasta_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
40 
41 #include <assert.h>
42 #include <cctype>
43 #include <fstream>
44 #include <sstream>
45 #include <stdexcept>
46 
47 namespace genesis {
48 namespace sequence {
49 
50 // =================================================================================================
51 // Constructor and Rule of Five
52 // =================================================================================================
53 
55 {
56  lookup_.set_all( true );
57 }
58 
59 // =================================================================================================
60 // Reading
61 // =================================================================================================
62 
63 void FastaReader::from_stream ( std::istream& input_stream, SequenceSet& sequence_set ) const
64 {
65  // Create an input stream and process it.
66  utils::InputStream it( utils::make_unique< utils::StreamInputSource >( input_stream ));
67  parse_document( it, sequence_set );
68 }
69 
70 SequenceSet FastaReader::from_stream( std::istream& input_stream ) const
71 {
72  // Create a new set and fill it.
73  SequenceSet result;
74  from_stream( input_stream, result );
75  return result;
76 }
77 
78 void FastaReader::from_file ( std::string const& file_name, SequenceSet& sequence_set ) const
79 {
80  // Create an input stream and process it.
81  utils::InputStream it( utils::make_unique< utils::FileInputSource >( file_name ));
82  parse_document( it, sequence_set );
83 }
84 
85 SequenceSet FastaReader::from_file( std::string const& file_name ) const
86 {
87  // Create a new set and fill it.
88  SequenceSet result;
89  from_file( file_name, result );
90  return result;
91 }
92 
93 void FastaReader::from_string ( std::string const& input_string, SequenceSet& sequence_set ) const
94 {
95  // Create an input stream and process it.
96  utils::InputStream it( utils::make_unique< utils::StringInputSource >( input_string ));
97  parse_document( it, sequence_set );
98 }
99 
100 SequenceSet FastaReader::from_string( std::string const& input_string ) const
101 {
102  // Create a new set and fill it.
103  SequenceSet result;
104  from_string( input_string, result );
105  return result;
106 }
107 
108 // =================================================================================================
109 // Parsing
110 // =================================================================================================
111 
113  utils::InputStream& input_stream,
114  SequenceSet& sequence_set
115 ) const {
116  Sequence seq;
117 
118  if( parsing_method_ == ParsingMethod::kDefault ) {
119  while( parse_sequence( input_stream, seq ) ) {
120  sequence_set.add( seq );
121  }
122 
123  } else if( parsing_method_ == ParsingMethod::kPedantic ) {
124  while( parse_sequence_pedantic( input_stream, seq ) ) {
125  sequence_set.add( seq );
126  }
127 
128  } else {
129  // There are no other methods currently implemented.
130  assert( false );
131  }
132 }
133 
135  utils::InputStream& input_stream,
136  Sequence& sequence
137 ) const {
138  // Init. Call clear in order to avoid not setting properties that might be added to
139  // Sequence in the future. Should not noticeable affect speed, as the sequence string capacities
140  // should not change when setting the strings to empty strings.
141  auto& it = input_stream;
142  sequence.clear();
143 
144  // Check for data.
145  if( !it ) {
146  return false;
147  }
148 
149  // We will do whole line reading in this parser. Thus, when reporting an error within a line,
150  // the input stream will already be at a different position. We can thus only report the
151  // beginning of the sequence where the error occured, but not the actual position.
152  // If we wanted to do this, we'd need more bookkeeping and awkward helper functions.
153  // Use parse_sequence_pedantic() instead in cases where you need exact error reporting.
154  auto seq_at = it.at();
155 
156  // Scope to ensure that the label and metadata line is only used
157  // while we actually are in that line.
158  {
159 
160  // Get the label and metadata line.
161  auto line_pair = it.get_line();
162  char* line = line_pair.first;
163 
164  // Check beginning of sequence.
165  if( line == nullptr || *line != '>' ) {
166  throw std::runtime_error(
167  "Malformed Fasta " + it.source_name()
168  + ": Expecting '>' at beginning of sequence at " + seq_at + "."
169  );
170  }
171  assert( line && *line == '>' );
172  ++line;
173 
174  // Parse label.
175  std::string label = utils::read_while( line, isgraph );
176  if( label == "" ) {
177  throw std::runtime_error(
178  "Malformed Fasta " + it.source_name()
179  + ": Expecting label after '>' in sequence at " + seq_at + "."
180  );
181  }
182  sequence.label( label );
183 
184  // Check for unexpected end of stream.
185  if( *line != '\0' && *line != ' ' ) {
186  throw std::runtime_error(
187  "Malformed Fasta " + it.source_name()
188  + ": Expecting a sequence after the label line in sequence at " + seq_at + "."
189  );
190  }
191  assert( *line == '\0' || *line == ' ' );
192 
193  // Parse metadata.
194  std::string metadata;
195  if( *line == ' ' ) {
196  ++line;
197  metadata = utils::read_while( line, isprint );
198  }
199  sequence.metadata( metadata );
200 
201  // Check for unexpected end of file.
202  if( *line != '\0' ) {
203  throw std::runtime_error(
204  "Malformed Fasta " + it.source_name()
205  + ": Unexpected characters at the end of the label line in sequence at " + seq_at + "."
206  );
207  }
208  assert( *line == '\0' );
209 
210  } // End of line scope. We are done with the label and metadata line.
211 
212  // Skip comments.
213  while( it && *it == ';' ) {
214  utils::skip_until( it, '\n' );
215  assert( it && *it == '\n' );
216  ++it;
217  }
218 
219  // Check for unexpected end of file.
220  if( !it ) {
221  throw std::runtime_error(
222  "Malformed Fasta " + it.source_name()
223  + ": Expecting a sequence after the label line in sequence at " + seq_at + "."
224  );
225  }
226  assert( it );
227 
228  // Reserve some tmp memory. We will later copy the content, so that superfluous capacity
229  // is stripped.
230  // We could do a sites.reserve( ... ) here, but this yields only minor speedups.
231  std::string sites;
232  // sites.reserve( n );
233 
234  // Parse sequence. At every beginning of the loop, we are at a line start.
235  while( it && *it != '>' ) {
236  assert( it.column() == 1 );
237 
238  auto line_pair = it.get_line();
239  if( line_pair.second == 0 ) {
240  throw std::runtime_error(
241  "Malformed Fasta " + it.source_name()
242  + ": Empty sequence line in sequence at " + seq_at + "."
243  );
244  }
245 
246  sites += std::string( line_pair.first, line_pair.second );
247  }
248  assert( !it || *it == '>' );
249 
250  if( sites.length() == 0 ) {
251  throw std::runtime_error(
252  "Malformed Fasta " + it.source_name() + ": Empty sequence at " + seq_at + "."
253  );
254  }
255 
256  if( to_upper_ ) {
257  sequence.sites() = utils::to_upper_ascii( sites );
258  } else {
259  // We could do a move here instead, but this way, we save some memory, which might be
260  // more reasonable for big sequences files than the small gain in speed.
261  // sequence.sites() = std::move(sites);
262  sequence.sites() = sites;
263  }
264 
265  if( use_validation_ ) {
266  for( auto const& c : sequence.sites() ) {
267  if( !lookup_[c] ) {
268  throw std::runtime_error(
269  "Malformed Fasta " + it.source_name()
270  + ": Invalid sequence symbols in sequence at " + seq_at + "."
271  );
272  }
273  }
274  }
275 
276  return true;
277 }
278 
280  utils::InputStream& input_stream,
281  Sequence& sequence
282 ) const {
283  // Init. Call clear in order to avoid not setting properties that might be added to
284  // Sequence in the future. Should not noticeable affect speed, as the sequence string capacities
285  // should not change when setting the strings to empty strings.
286  auto& it = input_stream;
287  sequence.clear();
288 
289  // Check for data.
290  if( !it ) {
291  return false;
292  }
293 
294  // Check beginning of sequence.
295  if( it.current() != '>' ) {
296  throw std::runtime_error(
297  "Malformed Fasta " + it.source_name()
298  + ": Expecting '>' at beginning of sequence at " + it.at() + "."
299  );
300  }
301  assert( it && *it == '>' );
302  ++it;
303 
304  // Parse label.
305  std::string label = utils::read_while( it, isgraph );
306  if( label == "" ) {
307  throw std::runtime_error(
308  "Malformed Fasta " + it.source_name()
309  + ": Expecting label after '>' at " + it.at() + "."
310  );
311  }
312  sequence.label( label );
313 
314  // Check for unexpected end of stream.
315  if( !it || ( *it != '\n' && *it != ' ' )) {
316  throw std::runtime_error(
317  "Malformed Fasta " + it.source_name()
318  + ": Expecting a sequence after the label line at " + it.at() + "."
319  );
320  }
321  assert( it && (*it == '\n' || *it == ' ') );
322 
323  // Parse metadata.
324  std::string metadata;
325  if( *it == ' ' ) {
326  ++it;
327  metadata = utils::read_while( it, isprint );
328  }
329  sequence.metadata( metadata );
330 
331  // Check for unexpected end of file.
332  if( !it || *it != '\n' ) {
333  throw std::runtime_error(
334  "Malformed Fasta " + it.source_name()
335  + ": Expecting a sequence after the label line at " + it.at() + "."
336  );
337  }
338  assert( it && *it == '\n' );
339 
340  // Skip comments.
341  while( it && *it == ';' ) {
342  utils::skip_while( it, isprint );
343  }
344 
345  // Check for unexpected end of file.
346  if( !it || *it != '\n' ) {
347  throw std::runtime_error(
348  "Malformed Fasta " + it.source_name()
349  + ": Expecting a sequence after the label line at " + it.at() + "."
350  );
351  }
352  assert( it && *it == '\n' );
353  ++it;
354 
355  // Parse sequence. At every beginning of the outer loop, we are at a line start.
356  std::string sites;
357  while( it && *it != '>' ) {
358  assert( it.column() == 1 );
359 
360  size_t count = 0;
361  while( it && *it != '\n' ) {
362  char c = ( to_upper_ ? toupper(*it) : *it );
363  if( use_validation_ && ! lookup_[c] ) {
364  throw std::runtime_error(
365  "Malformed Fasta " + it.source_name()
366  + ": Invalid sequence symbols at " + it.at() + "."
367  );
368  }
369 
370  sites += c;
371  ++it;
372  ++count;
373  }
374 
375  if( count == 0 ) {
376  throw std::runtime_error(
377  "Malformed Fasta " + it.source_name()
378  + ": Empty sequence line at " + it.at() + "."
379  );
380  }
381 
382  if( !it ) {
383  throw std::runtime_error(
384  "Malformed Fasta " + it.source_name()
385  + ": Sequence line does not end with '\\n' at " + it.at() + "."
386  );
387  }
388  assert( it && *it == '\n' );
389  ++it;
390  }
391  assert( !it || *it == '>' );
392 
393  if( sites.length() == 0 ) {
394  throw std::runtime_error(
395  "Malformed Fasta " + it.source_name()
396  + ": Empty sequence at " + it.at() + "."
397  );
398  }
399 
400  // Copy the sequence. We do not use move here, as we can save some memory this way.
401  sequence.sites() = sites;
402 
403  return true;
404 }
405 
406 // =================================================================================================
407 // Properties
408 // =================================================================================================
409 
411 {
412  parsing_method_ = value;
413  return *this;
414 }
415 
417 {
418  return parsing_method_;
419 }
420 
422 {
423  to_upper_ = value;
424  return *this;
425 }
426 
428 {
429  return to_upper_;
430 }
431 
432 FastaReader& FastaReader::valid_chars( std::string const& chars )
433 {
434  if( chars.size() == 0 ) {
435  lookup_.set_all( true );
436  use_validation_ = false;
437  } else {
438  lookup_.set_all( false );
439  lookup_.set_selection( chars, true );
440  use_validation_ = true;
441  }
442 
443  return *this;
444 }
445 
446 std::string FastaReader::valid_chars() const
447 {
448  // We need to check the valid chars lookup here, because we don't want to return a string
449  // of _all_ chars.
450  if( ! use_validation_ || lookup_.all_equal_to( true ) ) {
451  return "";
452  } else {
453  return lookup_.get_chars_equal_to( true );
454  }
455 }
456 
458 {
459  return lookup_;
460 }
461 
462 } // namespace sequence
463 } // namespace genesis
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one...
Definition: scanner.hpp:151
void set_all(T value)
Set the lookup status for all chars at once.
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
Read Fasta sequence data.
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
void from_string(std::string const &input_string, SequenceSet &sequence_set) const
Read all Sequences from a std::string in Fasta format into a SequenceSet.
ParsingMethod
Enumeration of the available methods for parsing Fasta sequences.
std::string const & sites() const
Definition: sequence.cpp:72
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
bool parse_sequence_pedantic(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
Provides some valuable additions to STD.
std::string const & label() const
Definition: sequence.cpp:44
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one...
Definition: scanner.hpp:214
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
std::string get_chars_equal_to(T comp_value) const
Return a std::string containg all chars which have lookup status equal to a given value...
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fasta document into a SequenceSet.
Provides some commonly used string utility functions.
void from_stream(std::istream &input_stream, SequenceSet &sequence_set) const
Read all Sequences from a std::istream in Fasta format into a SequenceSet.
Provides functions for accessing the file system.
Store a set of Sequences.
ParsingMethod parsing_method() const
Return the currently set parsing method.
bool to_upper() const
Return whether Sequence sites are automatically turned into upper case.
reference add(Sequence const &s)
Add a Sequence to the SequenceSet by copying it, and return a reference to it.
std::string const & metadata() const
Definition: sequence.cpp:58
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
FastaReader()
Create a default FastaReader. Per default, chars are turned upper case, but not validated.
void skip_until(InputStream &source, char criterion)
Lexing function that advances the stream until its current char equals the provided one...
Definition: scanner.hpp:182
char to_upper_ascii(char c)
Return the upper case of a given char, ascii-only.
Definition: string.cpp:286
void from_file(std::string const &file_name, SequenceSet &sequence_set) const
Read all Sequences from a file in Fasta format into a SequenceSet.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...