A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
fasta_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
41 
42 #include <assert.h>
43 #include <cctype>
44 #include <fstream>
45 #include <sstream>
46 #include <stdexcept>
47 
48 namespace genesis {
49 namespace sequence {
50 
51 // =================================================================================================
52 // Constructor and Rule of Five
53 // =================================================================================================
54 
56 {
57  lookup_.set_all( true );
58 }
59 
60 // =================================================================================================
61 // Reading
62 // =================================================================================================
63 
64 void FastaReader::from_stream ( std::istream& input_stream, SequenceSet& sequence_set ) const
65 {
66  // Create an input stream and process it.
67  utils::InputStream it( utils::make_unique< utils::StreamInputSource >( input_stream ));
68  parse_document( it, sequence_set );
69 }
70 
71 SequenceSet FastaReader::from_stream( std::istream& input_stream ) const
72 {
73  // Create a new set and fill it.
74  SequenceSet result;
75  from_stream( input_stream, result );
76  return result;
77 }
78 
79 void FastaReader::from_file ( std::string const& file_name, SequenceSet& sequence_set ) const
80 {
81  // Create an input stream and process it.
82  utils::InputStream it( utils::make_unique< utils::FileInputSource >( file_name ));
83  parse_document( it, sequence_set );
84 }
85 
86 SequenceSet FastaReader::from_file( std::string const& file_name ) const
87 {
88  // Create a new set and fill it.
89  SequenceSet result;
90  from_file( file_name, result );
91  return result;
92 }
93 
94 void FastaReader::from_string ( std::string const& input_string, SequenceSet& sequence_set ) const
95 {
96  // Create an input stream and process it.
97  utils::InputStream it( utils::make_unique< utils::StringInputSource >( input_string ));
98  parse_document( it, sequence_set );
99 }
100 
101 SequenceSet FastaReader::from_string( std::string const& input_string ) const
102 {
103  // Create a new set and fill it.
104  SequenceSet result;
105  from_string( input_string, result );
106  return result;
107 }
108 
109 // =================================================================================================
110 // Parsing
111 // =================================================================================================
112 
114  utils::InputStream& input_stream,
115  SequenceSet& sequence_set
116 ) const {
117  Sequence seq;
118 
119  if( parsing_method_ == ParsingMethod::kDefault ) {
120  while( parse_sequence( input_stream, seq ) ) {
121  sequence_set.add( seq );
122  }
123 
124  } else if( parsing_method_ == ParsingMethod::kPedantic ) {
125  while( parse_sequence_pedantic( input_stream, seq ) ) {
126  sequence_set.add( seq );
127  }
128 
129  } else {
130  // There are no other methods currently implemented.
131  assert( false );
132  }
133 }
134 
136  utils::InputStream& input_stream,
137  Sequence& sequence
138 ) const {
139  // Init. Call clear in order to avoid not setting properties that might be added to
140  // Sequence in the future. Should not noticeable affect speed, as the sequence string capacities
141  // should not change when setting the strings to empty strings.
142  auto& it = input_stream;
143  sequence.clear();
144 
145  // Check for data.
146  if( !it ) {
147  return false;
148  }
149 
150  // Scope to ensure that the label line is only used
151  // while we actually are in that line.
152  {
153 
154  // Get the label line.
155  auto line_pair = it.get_line();
156  char* line = line_pair.first;
157 
158  // Check beginning of sequence.
159  if( line == nullptr || *line != '>' ) {
160  throw std::runtime_error(
161  "Malformed Fasta " + it.source_name()
162  + ": Expecting '>' at beginning of sequence at line " + std::to_string( it.line() - 1 ) + "."
163  );
164  }
165  assert( line && *line == '>' );
166  ++line;
167 
168  // Parse label.
169  std::string label = utils::read_while( line, isprint );
170  if( label == "" ) {
171  throw std::runtime_error(
172  "Malformed Fasta " + it.source_name()
173  + ": Expecting label after '>' in sequence at line " + std::to_string( it.line() - 1 ) + "."
174  );
175  }
176  sequence.label( label );
177 
178  // Check for unexpected end of file.
179  if( *line != '\0' ) {
180  throw std::runtime_error(
181  "Malformed Fasta " + it.source_name()
182  + ": Unexpected characters at the end of the label line in sequence at line "
183  + std::to_string( it.line() - 1 ) + "."
184  );
185  }
186  assert( *line == '\0' );
187 
188  } // End of line scope. We are done with the label line.
189 
190  // Skip comments.
191  while( it && *it == ';' ) {
192  utils::skip_until( it, '\n' );
193  assert( it && *it == '\n' );
194  ++it;
195  }
196 
197  // Check for unexpected end of file.
198  if( !it ) {
199  throw std::runtime_error(
200  "Malformed Fasta " + it.source_name()
201  + ": Expecting a sequence after the label line in sequence at line "
202  + std::to_string( it.line() - 1 ) + "."
203  );
204  }
205  assert( it );
206 
207  // Reserve some tmp memory. We will later copy the content, so that superfluous capacity
208  // is stripped.
209  // We could do a sites.reserve( ... ) here, but this yields only minor speedups.
210  std::string sites;
211  // sites.reserve( n );
212 
213  // Parse sequence. At every beginning of the loop, we are at a line start.
214  while( it && *it != '>' ) {
215  assert( it.column() == 1 );
216 
217  auto line_pair = it.get_line();
218  if( line_pair.second > 0 ) {
219  sites += std::string( line_pair.first, line_pair.second );
220  }
221  }
222  assert( !it || *it == '>' );
223 
224  if( sites.length() == 0 ) {
225  throw std::runtime_error(
226  "Malformed Fasta " + it.source_name() + ": Empty sequence at line "
227  + std::to_string( it.line() - 1 ) + "."
228  );
229  }
230 
231  if( site_casing_ == SiteCasing::kToUpper ) {
232  sequence.sites() = utils::to_upper_ascii( sites );
233  } else if( site_casing_ == SiteCasing::kToLower ) {
234  sequence.sites() = utils::to_lower_ascii( sites );
235  } else {
236  // We could do a move here instead, but this way, we save some memory, which might be
237  // more reasonable for big sequences files than the small gain in speed.
238  // sequence.sites() = std::move(sites);
239  sequence.sites() = sites;
240  }
241 
242  if( use_validation_ ) {
243  for( auto const& c : sequence.sites() ) {
244  if( !lookup_[c] ) {
245  throw std::runtime_error(
246  "Malformed Fasta " + it.source_name() + ": Invalid sequence symbol "
247  + utils::char_to_hex( c, true )
248  + " in sequence near line " + std::to_string( it.line() - 1 ) + "."
249  );
250  }
251  }
252  }
253 
254  return true;
255 }
256 
258  utils::InputStream& input_stream,
259  Sequence& sequence
260 ) const {
261  // Init. Call clear in order to avoid not setting properties that might be added to
262  // Sequence in the future. Should not noticeable affect speed, as the sequence string capacities
263  // should not change when setting the strings to empty strings.
264  auto& it = input_stream;
265  sequence.clear();
266 
267  // Check for data.
268  if( !it ) {
269  return false;
270  }
271 
272  // Check beginning of sequence.
273  if( it.current() != '>' ) {
274  throw std::runtime_error(
275  "Malformed Fasta " + it.source_name()
276  + ": Expecting '>' at beginning of sequence at " + it.at() + "."
277  );
278  }
279  assert( it && *it == '>' );
280  ++it;
281 
282  // Parse label.
283  std::string label = utils::read_while( it, isprint );
284  if( label == "" ) {
285  throw std::runtime_error(
286  "Malformed Fasta " + it.source_name()
287  + ": Expecting label after '>' at " + it.at() + "."
288  );
289  }
290  sequence.label( label );
291 
292  // Check for unexpected end of stream.
293  if( !it || ( *it != '\n' )) {
294  throw std::runtime_error(
295  "Malformed Fasta " + it.source_name()
296  + ": Expecting a sequence after the label line at " + it.at() + "."
297  );
298  }
299  assert( it && (*it == '\n' ));
300 
301  // Check for unexpected end of file.
302  if( !it || *it != '\n' ) {
303  throw std::runtime_error(
304  "Malformed Fasta " + it.source_name()
305  + ": Expecting a sequence after the label line at " + it.at() + "."
306  );
307  }
308  assert( it && *it == '\n' );
309 
310  // Skip comments.
311  while( it && *it == ';' ) {
312  utils::skip_while( it, isprint );
313  }
314 
315  // Check for unexpected end of file.
316  if( !it || *it != '\n' ) {
317  throw std::runtime_error(
318  "Malformed Fasta " + it.source_name()
319  + ": Expecting a sequence after the label line at " + it.at() + "."
320  );
321  }
322  assert( it && *it == '\n' );
323  ++it;
324 
325  // Parse sequence. At every beginning of the outer loop, we are at a line start.
326  std::string sites;
327  while( it && *it != '>' ) {
328  assert( it.column() == 1 );
329 
330  size_t count = 0;
331  while( it && *it != '\n' ) {
332  char c = *it;
333  if( site_casing_ == SiteCasing::kToUpper ) {
334  c = toupper(c);
335  } else if( site_casing_ == SiteCasing::kToLower ) {
336  c = tolower(c);
337  }
338  if( use_validation_ && ! lookup_[c] ) {
339  throw std::runtime_error(
340  "Malformed Fasta " + it.source_name() + ": Invalid sequence symbol "
341  + utils::char_to_hex( c, true ) + " in sequence at " + it.at() + "."
342  );
343  }
344 
345  sites += c;
346  ++it;
347  ++count;
348  }
349 
350  if( count == 0 ) {
351  throw std::runtime_error(
352  "Malformed Fasta " + it.source_name()
353  + ": Empty sequence line at " + it.at() + "."
354  );
355  }
356 
357  if( !it ) {
358  throw std::runtime_error(
359  "Malformed Fasta " + it.source_name()
360  + ": Sequence line does not end with '\\n' at " + it.at() + "."
361  );
362  }
363  assert( it && *it == '\n' );
364  ++it;
365  }
366  assert( !it || *it == '>' );
367 
368  if( sites.length() == 0 ) {
369  throw std::runtime_error(
370  "Malformed Fasta " + it.source_name()
371  + ": Empty sequence at " + it.at() + "."
372  );
373  }
374 
375  // Copy the sequence. We do not use move here, as we can save some memory this way.
376  sequence.sites() = sites;
377 
378  return true;
379 }
380 
381 // =================================================================================================
382 // Properties
383 // =================================================================================================
384 
386 {
387  parsing_method_ = value;
388  return *this;
389 }
390 
392 {
393  return parsing_method_;
394 }
395 
397 {
398  site_casing_ = value;
399  return *this;
400 }
401 
403 {
404  return site_casing_;
405 }
406 
407 FastaReader& FastaReader::valid_chars( std::string const& chars )
408 {
409  if( chars.size() == 0 ) {
410  lookup_.set_all( true );
411  use_validation_ = false;
412  } else {
413  lookup_.set_all( false );
414  lookup_.set_selection( chars, true );
415  use_validation_ = true;
416  }
417 
418  return *this;
419 }
420 
421 std::string FastaReader::valid_chars() const
422 {
423  // We need to check the valid chars lookup here, because we don't want to return a string
424  // of _all_ chars.
425  if( ! use_validation_ || lookup_.all_equal_to( true ) ) {
426  return "";
427  } else {
428  return lookup_.get_chars_equal_to( true );
429  }
430 }
431 
433 {
434  return lookup_;
435 }
436 
437 } // namespace sequence
438 } // namespace genesis
char to_lower_ascii(char c)
Return the lower case of a given char, ascii-only.
Definition: char.hpp:81
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one...
Definition: scanner.hpp:151
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
void set_all(T value)
Set the lookup status for all chars at once.
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
Read Fasta sequence data.
bool parse_sequence(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
void from_string(std::string const &input_string, SequenceSet &sequence_set) const
Read all Sequences from a std::string in Fasta format into a SequenceSet.
ParsingMethod
Enumeration of the available methods for parsing Fasta sequences.
std::string const & sites() const
Definition: sequence.cpp:58
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
bool parse_sequence_pedantic(utils::InputStream &input_stream, Sequence &sequence) const
Parse a Sequence in Fasta format.
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:373
Provides some valuable additions to STD.
std::string const & label() const
Definition: sequence.cpp:44
std::string char_to_hex(char c, bool full=false)
Return the hex representation of a char.
Definition: char.hpp:104
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one...
Definition: scanner.hpp:214
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
std::string get_chars_equal_to(T comp_value) const
Return a std::string containg all chars which have lookup status equal to a given value...
void parse_document(utils::InputStream &input_stream, SequenceSet &sequence_set) const
Parse a whole fasta document into a SequenceSet.
Provides some commonly used string utility functions.
void from_stream(std::istream &input_stream, SequenceSet &sequence_set) const
Read all Sequences from a std::istream in Fasta format into a SequenceSet.
Provides functions for accessing the file system.
Store a set of Sequences.
char to_upper_ascii(char c)
Return the upper case of a given char, ascii-only.
Definition: char.hpp:89
ParsingMethod parsing_method() const
Return the currently set parsing method.
reference add(Sequence const &s)
Add a Sequence to the SequenceSet by copying it, and return a reference to it.
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
FastaReader()
Create a default FastaReader. Per default, chars are turned upper case, but not validated.
void skip_until(InputStream &source, char criterion)
Lexing function that advances the stream until its current char equals the provided one...
Definition: scanner.hpp:182
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
void from_file(std::string const &file_name, SequenceSet &sequence_set) const
Read all Sequences from a file in Fasta format into a SequenceSet.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...