A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
input_stream.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_IO_INPUT_STREAM_H_
2 #define GENESIS_UTILS_IO_INPUT_STREAM_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2017 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
36 
37 #include <assert.h>
38 #include <stdexcept>
39 #include <string>
40 #include <utility>
41 
42 namespace genesis {
43 namespace utils {
44 
45 // =================================================================================================
46 // Input Stream
47 // =================================================================================================
48 
77 {
78 public:
79 
80  // -------------------------------------------------------------
81  // Member Types
82  // -------------------------------------------------------------
83 
91  static const size_t BlockLength = 1 << 24;
92 
94  using value_type = char;
95 
96  // -------------------------------------------------------------
97  // Constructors and Rule of Five
98  // -------------------------------------------------------------
99 
101  : source_name_( "invalid source" )
102  , buffer_( nullptr )
103  , data_pos_( 0 )
104  , data_end_( 0 )
105  , current_( '\0' )
106  , line_( 0 )
107  , column_( 0 )
108  {}
109 
110  explicit InputStream( std::unique_ptr<BaseInputSource> input_source )
111  : line_( 1 )
112  , column_( 1 )
113  {
114  init_( std::move( input_source ));
115  }
116 
118  {
119  delete[] buffer_;
120  buffer_ = nullptr;
121  }
122 
123  InputStream(self_type const&) = delete;
124  InputStream(self_type&&) = delete;
125 
126  self_type& operator= (self_type const&) = delete;
127  self_type& operator= (self_type&&) = delete;
128 
129  // -------------------------------------------------------------
130  // Char Operations
131  // -------------------------------------------------------------
132 
136  char operator * () const
137  {
138  return current_;
139  }
140 
155  char current() const
156  {
157  if( data_pos_ >= data_end_ ) {
158  throw std::runtime_error(
159  "Unexpected end of " + source_name() + " at " + at() + "."
160  );
161  }
162  if( current_ < 0 ) {
163  throw std::domain_error(
164  "Invalid input char in " + source_name() + " at " + at() + "."
165  );
166  }
167  return current_;
168  }
169 
174  {
175  // If we were already at the end, set the counters to zero.
176  if( data_pos_ >= data_end_ ) {
177  reset_();
178  return *this;
179  }
180 
181  // Read data if necessary.
182  update_blocks_();
183 
184  // In case we are moving to a new line, set the counters accordingly.
185  if( current_ == '\n' ) {
186  ++line_;
187  column_ = 1;
188  } else {
189  ++column_;
190  }
191 
192  // Next position.
193  ++data_pos_;
194 
195  // Set the char.
196  set_current_char_();
197  return *this;
198  }
199 
204  {
205  advance();
206  return *this;
207  }
208 
214  char get_char()
215  {
216  char ret = current_;
217  advance();
218  return ret;
219  }
220 
221  // -------------------------------------------------------------
222  // Line Operations
223  // -------------------------------------------------------------
224 
235  std::pair< char*, size_t > get_line()
236  {
237  if( data_pos_ >= data_end_ ) {
238  return { nullptr, 0 };
239  }
240 
241  // Read data if necessary.
242  update_blocks_();
243 
244  // Find the end of the line.
245  size_t line_end = data_pos_;
246  while( line_end != data_end_ &&
247  buffer_[ line_end ] != '\n' &&
248  buffer_[ line_end ] != '\r'
249  ) {
250  ++line_end;
251  }
252 
253  // If the line is too long, throw.
254  if( line_end - data_pos_ + 1 > BlockLength ) {
255  throw std::runtime_error( "Input line too long in " + source_name() + " at " + at() );
256  }
257 
258  // Set the end of the line to \0, so that downstream parses can work with it.
259  if( buffer_[ line_end ] == '\n' ) {
260  buffer_[ line_end ] = '\0';
261 
262  } else if( buffer_[ line_end ] == '\r' ) {
263  buffer_[ line_end ] = '\0';
264 
265  // Treat stupid Windows \r\n lines breaks.
266  if( line_end + 1 < data_end_ && buffer_[ line_end + 1 ] == '\n' ) {
267  ++line_end;
268  buffer_[ line_end ] = '\0';
269  }
270  } else {
271 
272  // Files might be missing the line break at the end of the last line. Add it.
273  assert( line_end == data_end_ );
274  ++data_end_;
275  buffer_[ line_end ] = '\0';
276  }
277 
278  // Get pointer to beginning of the line, and length of the line, for returning it.
279  char* ret_ptr = buffer_ + data_pos_;
280  size_t ret_len = line_end - data_pos_;
281 
282  // Move to the first char of the next line, so that future calls for reading a line or
283  // char start at the right position.
284  data_pos_ = line_end + 1;
285  set_current_char_();
286 
287  // Set counters.
288  ++line_;
289  column_ = 1;
290 
291  return { ret_ptr, ret_len };
292  }
293 
294  // -------------------------------------------------------------
295  // State
296  // -------------------------------------------------------------
297 
303  size_t line() const
304  {
305  return line_;
306  }
307 
314  size_t column() const
315  {
316  return column_;
317  }
318 
323  std::string at() const
324  {
325  return std::to_string( line_ ) + ":" + std::to_string( column_ );
326  }
327 
331  bool good() const
332  {
333  return data_pos_ < data_end_;
334  }
335 
340  explicit operator bool() const
341  {
342  return data_pos_ < data_end_;
343  }
344 
348  bool eof() const
349  {
350  return data_pos_ >= data_end_;
351  }
352 
364  std::string source_name() const
365  {
366  return source_name_;
367  }
368 
369  // -------------------------------------------------------------
370  // Internal Members
371  // -------------------------------------------------------------
372 
373 private:
374 
378  void reset_()
379  {
380  line_ = 0;
381  column_ = 0;
382  current_ = '\0';
383  }
384 
388  void update_blocks_()
389  {
390  // This function is only called from the read char and line methods, which both beforehand
391  // check the following condition. So, if it breaks, this function is invalidly called
392  // from somewhere else.
393  assert( data_pos_ < data_end_ );
394 
395  // If this assertion breaks, someone tempered with our internal invariants.
396  assert( data_end_ <= BlockLength * 2 );
397 
398  // If we are past the first block, we need to load more data into the blocks.
399  if( data_pos_ >= BlockLength ) {
400 
401  // Move the second to the first block.
402  std::memcpy( buffer_, buffer_ + BlockLength, BlockLength );
403  data_pos_ -= BlockLength;
404  data_end_ -= BlockLength;
405 
406  // If we are not yet at the end of the data, start the reader again:
407  // Copy the third block to the second, and read into the third one.
408  if( input_reader_.valid() ) {
409  data_end_ += input_reader_.finish_reading();
410  std::memcpy( buffer_ + BlockLength, buffer_ + 2 * BlockLength, BlockLength );
411  input_reader_.start_reading( buffer_ + 2 * BlockLength, BlockLength );
412  }
413  }
414  }
415 
420  void set_current_char_()
421  {
422  if( data_pos_ >= data_end_ ) {
423  // If we just reached the end, do not fully reset the line and column counters.
424  // They might be needed in some parser.
425  current_ = '\0';
426  return;
427  }
428 
429  // Treat stupid Windows and Mac lines breaks. Set them to \n, so that downstream parsers
430  // don't have to deal with this.
431  if( buffer_[ data_pos_ ] == '\r' ) {
432  buffer_[ data_pos_ ] = '\n';
433 
434  // If this is a Win line break \r\n, skip one of them, so that only a single \n
435  // is visible to the outside.
436  if( data_pos_ + 1 < data_end_ && buffer_[ data_pos_ + 1 ] == '\n' ) {
437  ++data_pos_;
438  }
439  }
440 
441  // If this is the last char of the data, but there is no closing \n, add one.
442  if( data_pos_ + 1 == data_end_ && buffer_[ data_pos_ ] != '\n' ) {
443  ++data_end_;
444  buffer_[ data_pos_ + 1 ] = '\n';
445  }
446 
447  // Set the char.
448  current_ = buffer_[ data_pos_ ];
449  }
450 
454  void init_( std::unique_ptr<BaseInputSource> input_source )
455  {
456  // Set to empty defaults if there is no input.
457  if( input_source == nullptr ) {
458  source_name_ = "invalid source";
459 
460  buffer_ = nullptr;
461  data_pos_ = 0;
462  data_end_ = 0;
463 
464  current_ = '\0';
465  line_ = 0;
466  column_ = 0;
467  return;
468  }
469 
470  // We use three buffer blocks: one and two for the current line. The max line length is
471  // one buffer length, so the beginning of the line is always in the first block, while its
472  // end can reach into the second block, but never exeed it.
473  // The third block is for the async reading.
474  buffer_ = new char[ 3 * BlockLength ];
475 
476  try {
477  // Set source name.
478  source_name_ = input_source->source_name();
479 
480  // Read up to two blocks.
481  data_pos_ = 0;
482  data_end_ = input_source->read( buffer_, 2 * BlockLength );
483 
484  // Skip UTF-8 BOM, if found.
485  if( data_end_ >= 3 &&
486  buffer_[0] == '\xEF' &&
487  buffer_[1] == '\xBB' &&
488  buffer_[2] == '\xBF'
489  ) {
490  data_pos_ = 3;
491  }
492 
493  // If there was no data, set to "empty" values.
494  if( data_pos_ == data_end_ ) {
495  reset_();
496 
497  // If there is data, set char value.
498  } else {
499  set_current_char_();
500  }
501 
502  // If there is more data after the two blocks that we just read, start the
503  // reading process (possibly async, if pthreads is available).
504  if( data_end_ == 2 * BlockLength ) {
505  input_reader_.init( std::move( input_source ));
506  input_reader_.start_reading( buffer_ + 2 * BlockLength, BlockLength );
507  }
508 
509  } catch( ... ) {
510  delete[] buffer_;
511  throw;
512  }
513  }
514 
515  // -------------------------------------------------------------
516  // Data Members
517  // -------------------------------------------------------------
518 
519 private:
520 
521  // Input data comes from here...
522  InputReader input_reader_;
523  std::string source_name_;
524 
525  // ...and is buffered here.
526  char* buffer_;
527  size_t data_pos_;
528  size_t data_end_;
529 
530  // Also, keep track of the current char and position in the data.
531  char current_;
532  size_t line_;
533  size_t column_;
534 };
535 
536 } // namespace utils
537 } // namespace genesis
538 
539 #endif // include guard
bool eof() const
Return true iff the input reached its end.
char current() const
Return the current char, with some checks.
std::pair< char *, size_t > get_line()
Return the current line and move to the beginning of the next.
self_type & advance()
Move to the next char in the stream and advance the counters.
std::string at() const
Return a textual representation of the current input position in the form "line:column".
SynchronousReader InputReader
Alias for either AsynchronousReader or SynchronousReader, depending on the threading setting...
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:373
char operator*() const
Dereference operator. Return the current char.
InputStream(std::unique_ptr< BaseInputSource > input_source)
bool good() const
Return true iff the input is good (not end of data) and can be read from.
std::string source_name() const
Get the input source name where this stream reads from.
char get_char()
Extract a single char from the input.
static const size_t BlockLength
Block length for internal buffering.
size_t line() const
Return the current line of the input stream.
self_type & operator=(self_type const &)=delete
self_type & operator++()
Move to the next char in the stream. Shortcut for advance().
void start_reading(char *target_buffer, int target_size)
Stream interface for reading data from an InputSource, that keeps track of line and column counters...
void init(std::unique_ptr< BaseInputSource > input_source)
size_t column() const
Return the current column of the input stream.