A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
phylip_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
40 
41 #include <cassert>
42 #include <cctype>
43 #include <fstream>
44 #include <sstream>
45 #include <stdexcept>
46 
47 namespace genesis {
48 namespace sequence {
49 
50 // =================================================================================================
51 // Constructor and Rule of Five
52 // =================================================================================================
53 
55 {
56  lookup_.set_all( true );
57 }
58 
59 // =================================================================================================
60 // Reading
61 // =================================================================================================
62 
63 void PhylipReader::from_stream( std::istream& input_stream, SequenceSet& sequence_set ) const
64 {
65  utils::InputStream it( utils::make_unique< utils::StreamInputSource >( input_stream ));
66 
67  switch( mode_ ) {
68  case Mode::kSequential:
69  parse_phylip_sequential( it, sequence_set );
70  break;
71 
72  case Mode::kInterleaved:
73  parse_phylip_interleaved( it, sequence_set );
74  break;
75 
76  case Mode::kAutomatic:
77  throw std::runtime_error(
78  "Automatic mode of PhylipReader not possible when using from_stream()."
79  );
80 
81  default:
82  // We already covered all cases above. This cannot happen.
83  assert( false );
84  }
85 }
86 
87 SequenceSet PhylipReader::from_stream( std::istream& input_stream ) const
88 {
89  // Create a new set and fill it.
90  SequenceSet result;
91  from_stream( input_stream, result );
92  return result;
93 }
94 
95 void PhylipReader::from_file( std::string const& file_name, SequenceSet& sequence_set ) const
96 {
97  // This function is very similar to from_string, but has some differences in treating the input
98  // when needing to restart in automatic mode. Unfortunately, we have to live with this code
99  // duplication for now. Everything else would end up in a mess of small helper functions, which
100  // is also not nice.
101 
102  // Get stream.
103  utils::InputStream it( utils::make_unique< utils::FileInputSource >( file_name ));
104 
105  // If the mode is specified, use it.
106  if( mode_ == Mode::kSequential ) {
107  parse_phylip_sequential( it, sequence_set );
108  } else if( mode_ == Mode::kInterleaved ) {
109  parse_phylip_interleaved( it, sequence_set );
110 
111  // If the mode is automatic, we need to do some magic.
112  } else if( mode_ == Mode::kAutomatic ) {
113 
114  // We need a temporary set, because in case of failure, we need to start from the beginning,
115  // but we do not want to clear all other sequences in the set (that might be there from
116  // other stuff done before calling this function).
117  SequenceSet tmp;
118 
119  // Try sequential first.
120  try {
121  parse_phylip_sequential( it, tmp );
122 
123  // If it fails, restart the stream and try interleaved.
124  // If this also throws, the file is ill formatted and the exception will be forwared to
125  // the caller of this function.
126  } catch ( ... ) {
127  tmp.clear();
128 
129  // Prepare stream. Again. Then parse.
130  utils::InputStream it( utils::make_unique< utils::FileInputSource >( file_name ));
131  parse_phylip_interleaved( it, tmp );
132  }
133 
134  // If we reach this point, one of the parses succeeded.
135  // Move all sequences to the actual target.
136  for( auto s : tmp ) {
137  sequence_set.add( std::move(s) );
138  }
139 
140  } else {
141  // We already covered all cases above. This cannot happen.
142  assert( false );
143  }
144 }
145 
146 SequenceSet PhylipReader::from_file( std::string const& file_name ) const
147 {
148  // Create a new set and fill it.
149  SequenceSet result;
150  from_file( file_name, result );
151  return result;
152 }
153 
154 void PhylipReader::from_string( std::string const& input_string, SequenceSet& sequence_set ) const
155 {
156  // This function is very similar to from_file. See there for some more code explanations and for
157  // the reason why we currently do not re-engineer this to avoid this code duplication.
158 
159  // Prepare stream.
160  utils::InputStream it( utils::make_unique< utils::StringInputSource >( input_string ));
161 
162  // If the mode is specified, use it.
163  if( mode_ == Mode::kSequential ) {
164  parse_phylip_sequential( it, sequence_set );
165  } else if( mode_ == Mode::kInterleaved ) {
166  parse_phylip_interleaved( it, sequence_set );
167 
168  // If the mode is automatic, we need to do some magic.
169  } else if( mode_ == Mode::kAutomatic ) {
170 
171  // Temporary set.
172  SequenceSet tmp;
173 
174  // Try sequential first.
175  try {
176  parse_phylip_sequential( it, tmp );
177 
178  // If it fails, restart the stream and try interleaved.
179  } catch ( ... ) {
180  tmp.clear();
181 
182  // Prepare stream. Again. Then parse.
183  utils::InputStream it( utils::make_unique< utils::StringInputSource >( input_string ));
184  parse_phylip_interleaved( it, tmp );
185  }
186 
187  // If we reach this point, one of the parses succeeded.
188  // Move all sequences to the actual target.
189  for( auto s : tmp ) {
190  sequence_set.add( std::move(s) );
191  }
192 
193  } else {
194  // We already covered all cases above. This cannot happen.
195  assert( false );
196  }
197 }
198 
199 SequenceSet PhylipReader::from_string( std::string const& input_string ) const
200 {
201  // Create a new set and fill it.
202  SequenceSet result;
203  from_string( input_string, result );
204  return result;
205 }
206 
207 // =================================================================================================
208 // Parsing
209 // =================================================================================================
210 
212 {
213  Header result;
214 
215  // Read number of sequences.
216  utils::skip_while( it, isblank );
217  std::string num_seq_str = utils::read_while( it, isdigit );
218  if( num_seq_str.length() == 0 ) {
219  throw std::runtime_error(
220  "Malformed Phylip " + it.source_name() + ": Expecting sequence number at "
221  + it.at() + "."
222  );
223  }
224  result.num_sequences = std::stoi( num_seq_str );
225 
226  // Read length of sequences.
227  utils::skip_while( it, isblank );
228  std::string len_seq_str = utils::read_while( it, isdigit );
229  if( len_seq_str.length() == 0 ) {
230  throw std::runtime_error(
231  "Malformed Phylip " + it.source_name() + ": Expecting sequence length at "
232  + it.at() + "."
233  );
234  }
235  result.len_sequences = std::stoi( len_seq_str );
236 
237  // Sanity check.
238  if( result.num_sequences == 0 || result.len_sequences == 0 ) {
239  throw std::runtime_error(
240  "Malformed Phylip " + it.source_name() + ": Sequences are empty."
241  );
242  }
243 
244  // Process end of header line and proceed to first non-empty line.
245  utils::skip_while( it, isblank );
247  if( !it || *it != '\n' ) {
248  throw std::runtime_error(
249  "Malformed Phylip " + it.source_name() + ": Expecting end of line at " + it.at() + "."
250  );
251  }
252  utils::skip_while( it, '\n' );
253 
254  return result;
255 }
256 
258 {
259  std::string label;
260 
261  // Labels need to start with some graphical char.
262  if( !it || !isgraph( *it ) ) {
263  throw std::runtime_error(
264  "Malformed Phylip " + it.source_name() + ": Expecting label at " + it.at() + "."
265  );
266  }
267 
268  // Scan label until first blank/tab.
269  if( label_length_ == 0 ) {
270  label = utils::read_while( it, isgraph );
271  if( !it || !isblank( *it ) ) {
272  throw std::runtime_error(
273  "Malformed Phylip " + it.source_name() + ": Expecting delimiting white space at "
274  + it.at() + "."
275  );
276  }
277  utils::skip_while( it, isblank );
278 
279  // Scan label for `label_length` many chars.
280  } else {
281  for( size_t i = 0; i < label_length_; ++i ) {
282  if( !it || !isprint( *it ) ) {
283  throw std::runtime_error(
284  "Malformed Phylip " + it.source_name() + ": Invalid label at " + it.at() + "."
285  );
286  }
287 
288  label += *it;
289  ++it;
290  }
291  }
292 
293  label = utils::trim( label );
294  assert( label.size() > 0 );
295  return label;
296 }
297 
299 {
300  std::string seq;
301 
302  // Process the whole line.
303  while( it && *it != '\n' ) {
304  // Skip all blanks and digits.
305  if( isblank( *it ) || isdigit( *it ) ) {
306  ++it;
307  continue;
308  }
309 
310  // Check and process current char.
311  char c = *it;
312  if( site_casing_ == SiteCasing::kToUpper ) {
313  c = toupper(c);
314  } else if( site_casing_ == SiteCasing::kToLower ) {
315  c = tolower(c);
316  }
317  if( use_validation_ && !lookup_[c] ) {
318  throw std::runtime_error(
319  "Malformed Phylip " + it.source_name() + ": Invalid sequence symbols at "
320  + it.at() + "."
321  );
322  }
323  seq += c;
324  ++it;
325  }
326 
327  // All lines need to end with \n
328  if( !it ) {
329  throw std::runtime_error(
330  "Malformed Phylip " + it.source_name() + ": Sequence line does not end with '\\n' "
331  + it.at() + "."
332  );
333  }
334 
335  assert( it && *it == '\n' );
336  ++it;
337 
338  return seq;
339 }
340 
342 {
343  // Parse header line.
344  auto header = parse_phylip_header( it );
345  size_t num_seq = header.num_sequences;
346  size_t len_seq = header.len_sequences;
347 
348  // Process the given number of sequences. If there are not enough, the inner functions will
349  // throw. If there are too many, the check at the end will throw.
350  for( size_t seq_n = 0; seq_n < num_seq; ++seq_n ) {
351  assert( it.column() == 1 );
352  Sequence seq;
353 
354  // Parse label.
355  seq.label( parse_phylip_label( it ) );
356 
357  // Parse sequence. As long as we did not read as many sites as the header claimed, we read
358  // more lines from the input stream. If we then read too many chars (checked in the next
359  // step), the file is ill formatted. This is because a sequence always has to end with \n,
360  // and the label of the next sequence always has to start at the beginning of the line.
361  seq.sites().reserve( len_seq );
362  while( seq.sites().length() < len_seq ) {
363  seq.sites() += parse_phylip_sequence_line( it );
364  assert( it.column() == 1 );
365  }
366 
367  // Check sequence length.
368  if( seq.sites().length() > len_seq ) {
369  throw std::runtime_error(
370  "Malformed Phylip " + it.source_name() + ": Sequence with length "
371  + std::to_string( seq.sites().length() ) + " instead of "
372  + std::to_string( len_seq ) + " at " + it.at() + "."
373  );
374  }
375  assert( seq.sites().length() == len_seq );
376 
377  // Add to set.
378  sset.add( seq );
379  }
380 
381  // Final checks.
382  utils::skip_while( it, isspace );
383  if( it ) {
384  throw std::runtime_error(
385  "Malformed Phylip " + it.source_name() + ": Expected end of file at " + it.at() + "."
386  );
387  }
388  assert( sset.size() == num_seq );
389 }
390 
392 {
393  // Parse header line.
394  auto header = parse_phylip_header( it );
395  size_t num_seq = header.num_sequences;
396  size_t len_seq = header.len_sequences;
397 
398  // Helper function that checks the sequence length and throws if it is too long.
399  auto check_seq_len = [ &it, &len_seq ] ( Sequence const& seq ) {
400  if( seq.length() > len_seq ) {
401  throw std::runtime_error(
402  "Malformed Phylip " + it.source_name() + ": Sequence with length "
403  + std::to_string( seq.sites().length() ) + " instead of "
404  + std::to_string( len_seq ) + " at " + it.at() + "."
405  );
406  }
407  };
408 
409  // Process the first block, which contains the labels.
410  for( size_t seq_n = 0; seq_n < num_seq; ++seq_n ) {
411  assert( it.column() == 1 );
412  Sequence seq;
413 
414  // Parse label.
415  seq.label( parse_phylip_label( it ) );
416 
417  // Reserve mem and parse first part of sequence.
418  seq.sites().reserve( len_seq );
419  seq.sites() += parse_phylip_sequence_line( it );
420  check_seq_len( seq );
421 
422  // Add to set.
423  sset.add( seq );
424  }
425 
426  // Helper function that checks whether there are still sequences in the set that are not yet
427  // done (i.e., don't have `len_seq` length).
428  auto unfinished_sequences = [ & ] () {
429  for( auto const& seq : sset ) {
430  assert( seq.length() <= len_seq );
431  if( seq.length() < len_seq ) {
432  return true;
433  }
434  }
435  return false;
436  };
437 
438  while( unfinished_sequences() ) {
439  // Each block might start with an empty line. Skip.
440  if( !it ) {
441  throw std::runtime_error(
442  "Malformed Phylip " + it.source_name() + ": Unexpected end of file at "
443  + it.at() + "."
444  );
445  }
446  if( *it == '\n' ) {
447  ++it;
448  }
449 
450  // Parse the next block.
451  for( size_t seq_n = 0; seq_n < num_seq; ++seq_n ) {
452  assert( it.column() == 1 );
453  sset[seq_n].sites() += parse_phylip_sequence_line( it );
454  check_seq_len( sset[seq_n] );
455  }
456  }
457 
458  assert( sset.size() == num_seq );
459 }
460 
461 // =================================================================================================
462 // Properties
463 // =================================================================================================
464 
466 {
467  mode_ = value;
468  return *this;
469 }
470 
472 {
473  return mode_;
474 }
475 
477 {
478  label_length_ = value;
479  return *this;
480 }
481 
483 {
484  return label_length_;
485 }
486 
488 {
489  site_casing_ = value;
490  return *this;
491 }
492 
494 {
495  return site_casing_;
496 }
497 
498 PhylipReader& PhylipReader::valid_chars( std::string const& chars )
499 {
500  if( chars.size() == 0 ) {
501  lookup_.set_all( true );
502  use_validation_ = false;
503  } else {
504  lookup_.set_all( false );
505  lookup_.set_selection( chars, true );
506  use_validation_ = true;
507  }
508 
509  return *this;
510 }
511 
512 std::string PhylipReader::valid_chars() const
513 {
514  // We need to check the valid chars lookup here, because we don't want to return a string
515  // of _all_ chars.
516  if( ! use_validation_ || lookup_.all_equal_to( true ) ) {
517  return "";
518  } else {
519  return lookup_.get_chars_equal_to( true );
520  }
521 }
522 
524 {
525  return lookup_;
526 }
527 
528 } // namespace sequence
529 } // namespace genesis
void skip_while(InputStream &source, char criterion)
Lexing function that advances the stream while its current char equals the provided one...
Definition: scanner.hpp:151
Read Phylip sequence data.
std::string trim(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with trimmed white spaces.
Definition: string.cpp:352
void from_stream(std::istream &input_stream, SequenceSet &sequence_set) const
Read all Sequences from a std::istream in Phylip format into a SequenceSet.
void set_all(T value)
Set the lookup status for all chars at once.
bool all_equal_to(T comp_value) const
Return whether all chars compare equal to a given value.
Read the data in Phylip sequential mode.
std::string options
Store the options that might be at the end of the header line.
std::string valid_chars() const
Return the currently set chars used for validating Sequence sites.
std::string trim_right(std::string const &s, std::string const &delimiters)
Return a copy of the input string, with left trimmed white spaces.
Definition: string.cpp:328
std::string parse_phylip_label(utils::InputStream &it) const
Parse and return a Phylip label.
std::string at() const
Return a textual representation of the current input position in the form "line:column".
void set_selection(std::string const &chars, T value)
Set the lookup status for all chars that are contained in a given std::string.
std::string read_to_end_of_line(InputStream &source)
Lexing function that reads until the end of the line (i.e., to the new line char), and returns the read chars (excluding the new line char).
Definition: scanner.hpp:132
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:381
Helper that stores the header information of a Phylip file.
size_t label_length() const
Return the currently set label length.
Mode
Enum to distinguish between the different file variants of Phylip. See mode( Mode value ) for more de...
Provides some valuable additions to STD.
std::string const & label() const
Definition: sequence.cpp:44
size_t len_sequences
Length of the sequences in the Phylip file.
size_t num_sequences
Number of sequences in the Phylip file.
std::string read_while(InputStream &source, char criterion)
Lexing function that reads from the stream while its current char equals the provided one...
Definition: scanner.hpp:214
void clear()
Remove all Sequences from the SequenceSet, leaving it with a size() of 0.
SiteCasing site_casing() const
Return whether Sequence sites are automatically turned into upper or lower case.
std::string get_chars_equal_to(T comp_value) const
Return a std::string containg all chars which have lookup status equal to a given value...
Infer the Phylip mode via trial and error.
Provides some commonly used string utility functions.
Header parse_phylip_header(utils::InputStream &it) const
Parse a Phylip header and return the contained sequence count and length.
Provides functions for accessing the file system.
Store a set of Sequences.
std::string source_name() const
Get the input source name where this stream reads from.
reference add(Sequence const &s)
Add a Sequence to the SequenceSet by copying it, and return a reference to it.
utils::CharLookup< bool > & valid_char_lookup()
Return the internal CharLookup that is used for validating the Sequence sites.
std::string parse_phylip_sequence_line(utils::InputStream &it) const
Parse one sequence line.
void parse_phylip_interleaved(utils::InputStream &it, SequenceSet &sset) const
Parse a whole Phylip file using the interleaved variant (Mode::kInterleaved).
Read the data in Phylip interleaved mode.
void from_string(std::string const &input_string, SequenceSet &sequence_set) const
Read all Sequences from a std::string in Phylip format into a SequenceSet.
void from_file(std::string const &file_name, SequenceSet &sequence_set) const
Read all Sequences from a file in Phylip format into a SequenceSet.
void parse_phylip_sequential(utils::InputStream &it, SequenceSet &sset) const
Parse a whole Phylip file using the sequential variant (Mode::kSequential).
SiteCasing
Enumeration of casing methods to apply to each site of a Sequence.
Stream interface for reading data from an InputSource, that keeps track of line and column counters...
PhylipReader()
Create a default PhylipReader. Per default, chars are turned upper case, but not validated.
size_t column() const
Return the current column of the input stream.