A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
phylip_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
40 
41 #include <cassert>
42 #include <cctype>
43 #include <fstream>
44 #include <sstream>
45 #include <stdexcept>
46 
47 namespace genesis {
48 namespace sequence {
49 
50 // =================================================================================================
51 // Constructor and Rule of Five
52 // =================================================================================================
53 
55 {
56  lookup_.set_all( true );
57 }
58 
59 // =================================================================================================
60 // Reading
61 // =================================================================================================
62 
63 void PhylipReader::from_stream( std::istream& input_stream, SequenceSet& sequence_set ) const
64 {
65  utils::InputStream it( utils::make_unique< utils::StreamInputSource >( input_stream ));
66 
67  switch( mode_ ) {
68  case Mode::kSequential:
69  parse_phylip_sequential( it, sequence_set );
70  break;
71 
72  case Mode::kInterleaved:
73  parse_phylip_interleaved( it, sequence_set );
74  break;
75 
76  case Mode::kAutomatic:
77  throw std::runtime_error(
78  "Automatic mode of PhylipReader not possible when using from_stream()."
79  );
80 
81  default:
82  // We already covered all cases above. This cannot happen.
83  assert( false );
84  }
85 }
86 
87 SequenceSet PhylipReader::from_stream( std::istream& input_stream ) const
88 {
89  // Create a new set and fill it.
90  SequenceSet result;
91  from_stream( input_stream, result );
92  return result;
93 }
94 
95 void PhylipReader::from_file( std::string const& file_name, SequenceSet& sequence_set ) const
96 {
97  // This function is very similar to from_string, but has some differences in treating the input
98  // when needing to restart in automatic mode. Unfortunately, we have to live with this code
99  // duplication for now. Everything else would end up in a mess of small helper functions, which
100  // is also not nice.
101 
102  // Get stream.
103  utils::InputStream it( utils::make_unique< utils::FileInputSource >( file_name ));
104 
105  // If the mode is specified, use it.
106  if( mode_ == Mode::kSequential ) {
107  parse_phylip_sequential( it, sequence_set );
108  } else if( mode_ == Mode::kInterleaved ) {
109  parse_phylip_interleaved( it, sequence_set );
110 
111  // If the mode is automatic, we need to do some magic.
112  } else if( mode_ == Mode::kAutomatic ) {
113 
114  // We need a temporary set, because in case of failure, we need to start from the beginning,
115  // but we do not want to clear all other sequences in the set (that might be there from
116  // other stuff done before calling this function).
117  SequenceSet tmp;
118 
119  // Try sequential first.
120  try {
121  parse_phylip_sequential( it, tmp );
122 
123  // If it fails, restart the stream and try interleaved.
124  // If this also throws, the file is ill formatted and the exception will be forwared to
125  // the caller of this function.
126  } catch ( ... ) {
127  tmp.clear();
128 
129  // Prepare stream. Again. Then parse.
130  utils::InputStream it( utils::make_unique< utils::FileInputSource >( file_name ));
131  parse_phylip_interleaved( it, tmp );
132  }
133 
134  // If we reach this point, one of the parses succeeded.
135  // Move all sequences to the actual target.
136  for( auto s : tmp ) {
137  sequence_set.add( std::move(s) );
138  }
139 
140  } else {
141  // We already covered all cases above. This cannot happen.
142  assert( false );
143  }
144 }
145 
146 SequenceSet PhylipReader::from_file( std::string const& file_name ) const
147 {
148  // Create a new set and fill it.
149  SequenceSet result;
150  from_file( file_name, result );
151  return result;
152 }
153 
154 void PhylipReader::from_string( std::string const& input_string, SequenceSet& sequence_set ) const
155 {
156  // This function is very similar to from_file. See there for some more code explanations and for
157  // the reason why we currently do not re-engineer this to avoid this code duplication.
158 
159  // Prepare stream.
160  utils::InputStream it( utils::make_unique< utils::StringInputSource >( input_string ));
161 
162  // If the mode is specified, use it.
163  if( mode_ == Mode::kSequential ) {
164  parse_phylip_sequential( it, sequence_set );
165  } else if( mode_ == Mode::kInterleaved ) {
166  parse_phylip_interleaved( it, sequence_set );
167 
168  // If the mode is automatic, we need to do some magic.
169  } else if( mode_ == Mode::kAutomatic ) {
170 
171  // Temporary set.
172  SequenceSet tmp;
173 
174  // Try sequential first.
175  try {
176  parse_phylip_sequential( it, tmp );
177 
178  // If it fails, restart the stream and try interleaved.
179  } catch ( ... ) {
180  tmp.clear();
181 
182  // Prepare stream. Again. Then parse.
183  utils::InputStream it( utils::make_unique< utils::StringInputSource >( input_string ));
184  parse_phylip_interleaved( it, tmp );
185  }
186 
187  // If we reach this point, one of the parses succeeded.
188  // Move all sequences to the actual target.
189  for( auto s : tmp ) {
190  sequence_set.add( std::move(s) );
191  }
192 
193  } else {
194  // We already covered all cases above. This cannot happen.
195  assert( false );
196  }
197 }
198 
199 SequenceSet PhylipReader::from_string( std::string const& input_string ) const
200 {
201  // Create a new set and fill it.
202  SequenceSet result;
203  from_string( input_string, result );
204  return result;
205 }
206 
207 // =================================================================================================
208 // Parsing
209 // =================================================================================================
210 
212 {
213  Header result;
214 
215  // Read number of sequences.
216  utils::skip_while( it, isblank );
217  std::string num_seq_str = utils::read_while( it, isdigit );
218  if( num_seq_str.length() == 0 ) {
219  throw std::runtime_error(
220  "Malformed Phylip " + it.source_name() + ": Expecting sequence number at "
221  + it.at() + "."
222  );
223  }
224  result.num_sequences = std::stoi( num_seq_str );
225 
226  // Read length of sequences.
227  utils::skip_while( it, isblank );
228  std::string len_seq_str = utils::read_while( it, isdigit );
229  if( len_seq_str.length() == 0 ) {
230  throw std::runtime_error(
231  "Malformed Phylip " + it.source_name() + ": Expecting sequence length at "
232  + it.at() + "."
233  );
234  }
235  result.len_sequences = std::stoi( len_seq_str );
236 
237  // Sanity check.
238  if( result.num_sequences == 0 || result.len_sequences == 0 ) {
239  throw std::runtime_error(
240  "Malformed Phylip " + it.source_name() + ": Sequences are empty."
241  );
242  }
243 
244  // Process end of header line and proceed to first non-empty line.
245  utils::skip_while( it, isblank );
247  if( !it || *it != '\n' ) {
248  throw std::runtime_error(
249  "Malformed Phylip " + it.source_name() + ": Expecting end of line at " + it.at() + "."
250  );
251  }
252  utils::skip_while( it, '\n' );
253 
254  return result;
255 }
256 
258 {
259  std::string label;
260 
261  // Labels need to start with some graphical char.
262  if( !it || !isgraph( *it ) ) {
263  throw std::runtime_error(
264  "Malformed Phylip " + it.source_name() + ": Expecting label at " + it.at() + "."
265  );
266  }
267 
268  // Scan label until first blank/tab.
269  if( label_length_ == 0 ) {
270  label = utils::read_while( it, isgraph );
271  if( !it || !isblank( *it ) ) {
272  throw std::runtime_error(
273  "Malformed Phylip " + it.source_name() + ": Expecting delimiting white space at "
274  + it.at() + "."
275  );
276  }
277  utils::skip_while( it, isblank );
278 
279  // Scan label for `label_length` many chars.
280  } else {
281  for( size_t i = 0; i < label_length_; ++i ) {
282  if( !it || !isprint( *it ) ) {
283  throw std::runtime_error(
284  "Malformed Phylip " + it.source_name() + ": Invalid label at " + it.at() + "."
285  );
286  }
287 
288  label += *it;
289  ++it;
290  }
291  }
292 
293  label = utils::trim( label );
294  assert( label.size() > 0 );
295  return label;
296 }
297 
299 {
300  std::string seq;
301 
302  // Process the whole line.
303  while( it && *it != '\n' ) {
304  // Skip all blanks and digits.
305  if( isblank( *it ) || isdigit( *it ) ) {
306  ++it;
307  continue;
308  }
309 
310  // Check and process current char.
311  char c = ( to_upper_ ? toupper(*it) : *it );
312  if( use_validation_ && !lookup_[c] ) {
313  throw std::runtime_error(
314  "Malformed Phylip " + it.source_name() + ": Invalid sequence symbols at "
315  + it.at() + "."
316  );
317  }
318  seq += c;
319  ++it;
320  }
321 
322  // All lines need to end with \n
323  if( !it ) {
324  throw std::runtime_error(
325  "Malformed Phylip " + it.source_name() + ": Sequence line does not end with '\\n' "
326  + it.at() + "."
327  );
328  }
329 
330  assert( it && *it == '\n' );
331  ++it;
332 
333  return seq;
334 }
335 
337 {
338  // Parse header line.
339  auto header = parse_phylip_header( it );
340  size_t num_seq = header.num_sequences;
341  size_t len_seq = header.len_sequences;
342 
343  // Process the given number of sequences. If there are not enough, the inner functions will
344  // throw. If there are too many, the check at the end will throw.
345  for( size_t seq_n = 0; seq_n < num_seq; ++seq_n ) {
346  assert( it.column() == 1 );
347  Sequence seq;
348 
349  // Parse label.
350  seq.label( parse_phylip_label( it ) );
351 
352  // Parse sequence. As long as we did not read as many sites as the header claimed, we read
353  // more lines from the input stream. If we then read too many chars (checked in the next
354  // step), the file is ill formatted. This is because a sequence always has to end with \n,
355  // and the label of the next sequence always has to start at the beginning of the line.
356  seq.sites().reserve( len_seq );
357  while( seq.sites().length() < len_seq ) {
358  seq.sites() += parse_phylip_sequence_line( it );
359  assert( it.column() == 1 );
360  }
361 
362  // Check sequence length.
363  if( seq.sites().length() > len_seq ) {
364  throw std::runtime_error(
365  "Malformed Phylip " + it.source_name() + ": Sequence with length "
366  + std::to_string( seq.sites().length() ) + " instead of "
367  + std::to_string( len_seq ) + " at " + it.at() + "."
368  );
369  }
370  assert( seq.sites().length() == len_seq );
371 
372  // Add to set.
373  sset.add( seq );
374  }
375 
376  // Final checks.
377  utils::skip_while( it, isspace );
378  if( it ) {
379  throw std::runtime_error(
380  "Malformed Phylip " + it.source_name() + ": Expected end of file at " + it.at() + "."
381  );
382  }
383  assert( sset.size() == num_seq );
384 }
385 
387 {
388  // Parse header line.
389  auto header = parse_phylip_header( it );
390  size_t num_seq = header.num_sequences;
391  size_t len_seq = header.len_sequences;
392 
393  // Helper function that checks the sequence length and throws if it is too long.
394  auto check_seq_len = [ &it, &len_seq ] ( Sequence const& seq ) {
395  if( seq.length() > len_seq ) {
396  throw std::runtime_error(
397  "Malformed Phylip " + it.source_name() + ": Sequence with length "
398  + std::to_string( seq.sites().length() ) + " instead of "
399  + std::to_string( len_seq ) + " at " + it.at() + "."
400  );
401  }
402  };
403 
404  // Process the first block, which contains the labels.
405  for( size_t seq_n = 0; seq_n < num_seq; ++seq_n ) {
406  assert( it.column() == 1 );
407  Sequence seq;
408 
409  // Parse label.
410  seq.label( parse_phylip_label( it ) );
411 
412  // Reserve mem and parse first part of sequence.
413  seq.sites().reserve( len_seq );
414  seq.sites() += parse_phylip_sequence_line( it );
415  check_seq_len( seq );
416 
417  // Add to set.
418  sset.add( seq );
419  }
420 
421  // Helper function that checks whether there are still sequences in the set that are not yet
422  // done (i.e., don't have `len_seq` length).
423  auto unfinished_sequences = [ & ] () {
424  for( auto const& seq : sset ) {
425  assert( seq.length() <= len_seq );
426  if( seq.length() < len_seq ) {
427  return true;
428  }
429  }
430  return false;
431  };
432 
433  while( unfinished_sequences() ) {
434  // Each block might start with an empty line. Skip.
435  if( !it ) {
436  throw std::runtime_error(
437  "Malformed Phylip " + it.source_name() + ": Unexpected end of file at "
438  + it.at() + "."
439  );
440  }
441  if( *it == '\n' ) {
442  ++it;
443  }
444 
445  // Parse the next block.
446  for( size_t seq_n = 0; seq_n < num_seq; ++seq_n ) {
447  assert( it.column() == 1 );
448  sset[seq_n].sites() += parse_phylip_sequence_line( it );
449  check_seq_len( sset[seq_n] );
450  }
451  }
452 
453  assert( sset.size() == num_seq );
454 }
455 
456 // =================================================================================================
457 // Properties
458 // =================================================================================================
459 
461 {
462  mode_ = value;
463  return *this;
464 }
465 
467 {
468  return mode_;
469 }
470 
472 {
473  label_length_ = value;
474  return *this;
475 }
476 
478 {
479  return label_length_;
480 }
481 
483 {
484  to_upper_ = value;
485  return *this;
486 }
487 
489 {
490  return to_upper_;
491 }
492 
493 PhylipReader& PhylipReader::valid_chars( std::string const& chars )
494 {
495  if( chars.size() == 0 ) {
496  lookup_.set_all( true );
497  use_validation_ = false;
498  } else {
499  lookup_.set_all( false );
500  lookup_.set_selection( chars, true );
501  use_validation_ = true;
502  }
503 
504  return *this;
505 }
506 
507 std::string PhylipReader::valid_chars() const
508 {
509  // We need to check the valid chars lookup here, because we don't want to return a string
510  // of _all_ chars.
511  if( ! use_validation_ || lookup_.all_equal_to( true ) ) {
512  return "";
513  } else {
514  return lookup_.get_chars_equal_to( true );
515  }
516 }
517 
519 {
520  return lookup_;
521 }
522 
523 } // namespace sequence
524 } // namespace genesis
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one...
Definition: scanner.hpp:151
Read Phylip sequence data.
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces.
Definition: string.cpp:238
void from_stream(std::istream &input_stream, SequenceSet &sequence_set) const
Read all Sequences from a std::istream in Phylip format into a SequenceSet.
void set_all(T value)
Set the lookup status for all chars at once.
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
Read the data in Phylip sequential mode.
std::string options
Store the options that might be at the end of the header line.
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
std::string trim_right(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with left trimmed white spaces.
Definition: string.cpp:214
std::string parse_phylip_label(utils::InputStream &it) const
Parse and return a Phylip label.
std::string at() const
Return a textual representation of the current input position in the form "line:column".
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
std::string read_to_end_of_line(InputStream &source)
Lexing function that reads until the end of the line (i.e., to the new line char), and returns the read chars (excluding the new line char).
Definition: scanner.hpp:132
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:300
Helper that stores the header information of a Phylip file.
bool to_upper() const
Return whether Sequence sites are automatically turned into upper case.
size_t label_length() const
Return the currently set label length.
Mode
Enum to distinguish between the different file variants of Phylip. See mode( Mode value ) for more de...
Provides some valuable additions to STD.
std::string const & label() const
Definition: sequence.cpp:44
size_t len_sequences
Length of the sequences in the Phylip file.
size_t num_sequences
Number of sequences in the Phylip file.
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one...
Definition: scanner.hpp:214
void clear()
Remove all Sequences from the SequenceSet, leaving it with a size() of 0.
std::string get_chars_equal_to(T comp_value) const
Return a std::string containg all chars which have lookup status equal to a given value...
Infer the Phylip mode via trial and error.
Provides some commonly used string utility functions.
Header parse_phylip_header(utils::InputStream &it) const
Parse a Phylip header and return the contained sequence count and length.
Provides functions for accessing the file system.
Store a set of Sequences.
std::string source_name() const
Get the input source name where this stream reads from.
reference add(Sequence const &s)
Add a Sequence to the SequenceSet by copying it, and return a reference to it.
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
std::string parse_phylip_sequence_line(utils::InputStream &it) const
Parse one sequence line.
void parse_phylip_interleaved(utils::InputStream &it, SequenceSet &sset) const
Parse a whole Phylip file using the sequential variant (Mode::kSequential).
Read the data in Phylip interleaved mode.
void from_string(std::string const &input_string, SequenceSet &sequence_set) const
Read all Sequences from a std::string in Phylip format into a SequenceSet.
void from_file(std::string const &file_name, SequenceSet &sequence_set) const
Read all Sequences from a file in Phylip format into a SequenceSet.
void parse_phylip_sequential(utils::InputStream &it, SequenceSet &sset) const
Parse a whole Phylip file using the interleaved variant (Mode::kInterleaved).
Stream interface for reading data from an InputSource, that keeps track of line and column counters...
PhylipReader()
Create a default PhylipReader. Per default, chars are turned upper case, but not validated.
size_t column() const
Return the current column of the input stream.