A library for working with phylogenetic and population genetic data.
v0.27.0
input_stream.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_IO_INPUT_STREAM_H_
2 #define GENESIS_UTILS_IO_INPUT_STREAM_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2022 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
38 
39 #include <cassert>
40 #include <cstdint>
41 #include <functional>
42 #include <memory>
43 #include <stdexcept>
44 #include <string>
45 #include <utility>
46 
47 namespace genesis {
48 namespace utils {
49 
50 // =================================================================================================
51 // Input Stream
52 // =================================================================================================
53 
82 {
83 public:
84 
85  // -------------------------------------------------------------------------
86  // Member Types
87  // -------------------------------------------------------------------------
88 
94  static const size_t BlockLength = 1 << 22; // Block length for internal buffering: 2^22 = 4,194,304 chars. Positions at or beyond it trigger update_blocks_().
95 
97  using value_type = char; // The stream hands out single chars.
98 
99  // -------------------------------------------------------------------------
100  // Constructors and Rule of Five
101  // -------------------------------------------------------------------------
102 
104  : source_name_( "invalid source" ) // Default constructor initializers: a stream without any source gets a placeholder name.
105  , buffer_( nullptr ) // No buffer is allocated; delete[] on nullptr in the destructor is a no-op.
106  , data_pos_( 0 )
107  , data_end_( 0 ) // data_pos_ == data_end_, so good() is false and eof() is true.
108  , current_( '\0' )
109  , line_( 0 ) // Counters are 0 here, whereas the constructor taking an input source starts them at 1.
110  , column_( 0 )
111  {}
112 
113  explicit InputStream( std::shared_ptr<BaseInputSource> input_source ) // Construct a stream that reads from the given input source.
114  : line_( 1 ) // Line and column counters start at 1 for a stream that has a source.
115  , column_( 1 )
116  {
117  init_( std::move( input_source )); // The by-value parameter is a sink: move it on instead of copying the shared_ptr. The remaining members are expected to be set up by init_(), which is defined elsewhere.
118  }
119 
121  { // Destructor body: release the buffer held in the raw pointer buffer_.
122  delete[] buffer_; // buffer_ may be nullptr (e.g., after default construction), which delete[] accepts.
123  buffer_ = nullptr;
124  }
125 
126  InputStream( self_type const& ) = delete; // Not copyable: the stream holds a raw buffer pointer and a unique_ptr reader.
128  : buffer_( nullptr ) // Move constructor initializer: start without a buffer before taking over the state of `other`.
129  {
130  *this = std::move( other ); // Delegates to the move assignment declared below. NOTE(review): data_pos_, data_end_, current_, line_ and column_ are not initialized before this call; presumably the move assignment (defined elsewhere) overwrites all of them - confirm.
131  }
132 
133  self_type& operator= ( self_type const& ) = delete; // Copy assignment is disabled as well.
134  self_type& operator= ( self_type&& other ); // Move assignment; declaration only, defined elsewhere.
135 
136  // -------------------------------------------------------------------------
137  // Stream Iterator Operations
138  // -------------------------------------------------------------------------
139 
143  inline char operator* () const // Dereference operator: return the current char without any checks.
144  {
145  return current_; // At the end of the data this is '\0', as set by set_current_char_() and reset_().
146  }
147 
162  inline char current() const // Return the current char, after checking that there is data left and that the char is within [0, 127].
163  {
164  if( data_pos_ >= data_end_ ) GENESIS_UNLIKELY { // End of data: throws std::runtime_error.
165  throw std::runtime_error(
166  "Unexpected end of " + source_name() + " at " + at() + "."
167  );
168  }
169  if( static_cast<unsigned char>( current_ ) > 127 ) GENESIS_UNLIKELY { // Byte outside of [0, 127]: throws std::domain_error. Tested as unsigned char, because `current_ < 0` can never be true on platforms where plain char is unsigned.
170  throw std::domain_error(
171  "Invalid input char in " + source_name() + " at " + at() + "."
172  );
173  }
174  return current_;
175  }
176 
180  inline self_type& advance() // Move to the next char in the stream and advance the counters; named alternative to operator++.
181  {
182  // The increment operator does all the work, and hands back this very stream.
183  return operator++();
184  }
185 
190  { // Body of operator++(): step to the next char and keep line_ and column_ in sync.
191  // If we were already at the end, set the counters to zero.
192  if( data_pos_ >= data_end_ ) GENESIS_UNLIKELY {
193  reset_(); // Zeroes line_ and column_, and sets current_ to '\0'.
194  return *this;
195  }
196 
197  // Read data if necessary.
198  if( data_pos_ >= BlockLength ) GENESIS_UNLIKELY {
199  update_blocks_(); // Defined elsewhere; the assertion below expects data_pos_ < BlockLength afterwards.
200  }
201  assert( data_pos_ < BlockLength );
202 
203  // In case we are moving to a new line, set the counters accordingly.
204  if( current_ == '\n' ) { // current_ still holds the char that we are about to leave.
205  ++line_;
206  column_ = 1;
207  } else {
208  ++column_;
209  }
210 
211  // Next position.
212  ++data_pos_;
213 
214  // Set the char.
215  set_current_char_(); // Also turns '\r' and "\r\n" into a single '\n', and handles the end of the data.
216  return *this;
217  }
218 
224  inline char get_char() // Extract a single char: return the current char and move the stream to the next one.
225  {
226  char const extracted = current_; // Remember the char before moving on; no end-of-data check is done here.
227  ++( *this );
228  return extracted;
229  }
230 
231  // -------------------------------------------------------------------------
232  // Line Operations
233  // -------------------------------------------------------------------------
234 
243  void get_line( std::string& target ); // Declaration only, defined elsewhere. The get_line() overload without arguments calls this with an empty string. NOTE(review): whether `target` is appended to or overwritten is not visible here.
244 
252  std::string get_line() // Read the current line and return it as a new string.
253  {
254  std::string line_content; // Starts empty, and is filled by the overload that takes a target string.
255  get_line( line_content );
256  return line_content;
257  }
258 
259  // -------------------------------------------------------------------------
260  // Char Operations
261  // -------------------------------------------------------------------------
262 
271  inline char read_char_or_throw( char const criterion ) // Require the current char to equal `criterion`, move to the next char, and return `criterion`.
272  {
273  // Check char and move to next.
274  if( data_pos_ >= data_end_ || current_ != criterion ) GENESIS_UNLIKELY { // Throws std::runtime_error at the end of the data as well as on a mismatch.
275  throw std::runtime_error(
276  std::string("In ") + source_name() + ": " +
277  "Expecting " + char_to_hex( criterion ) + " at " + at() + ", " +
278  "but received " + char_to_hex( current_ ) + " instead."
279  );
280  }
281  assert( good() && current_ == criterion );
282  operator++(); // Consume the char that was just verified.
283  return criterion;
284  }
285 
294  inline char read_char_or_throw( std::function<bool (char)> criterion ) // Require the current char to fulfill the predicate `criterion`, move to the next char, and return the char that was read.
295  {
296  // Check char and move to next.
297  if( data_pos_ >= data_end_ || !criterion( current_ )) GENESIS_UNLIKELY { // At the end of the data, the predicate is not called at all (short-circuit). Throws std::runtime_error.
298  throw std::runtime_error(
299  std::string("In ") + source_name() + ": " +
300  "Unexpected char " + char_to_hex( current_ ) + " at " + at() + "."
301  );
302  }
303  assert( good() );
304  auto const chr = current_; // Keep the accepted char, as operator++ replaces current_.
305  operator++();
306  return chr;
307  }
308 
309  // -------------------------------------------------------------------------
310  // Parsers
311  // -------------------------------------------------------------------------
312 
313 private:
314 
315  // Only use intrinsics version for the compilers that support them!
316  #if defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
317 
322  size_t parse_unsigned_integer_gcc_intrinsic_(); // Declared only for GCC and Clang; defined elsewhere.
323 
328  size_t parse_unsigned_integer_from_chars_(); // Declared only for GCC and Clang; defined elsewhere.
329 
330  #endif
331 
332  // Only use C++17 code if we are compiled with that version.
333  #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
334 
339  size_t parse_unsigned_integer_std_from_chars_(); // Declared only when compiling as C++17 or later; defined elsewhere.
340 
341  #endif
342 
346  size_t parse_unsigned_integer_naive_(); // Always declared; defined elsewhere.
347 
351  size_t parse_unsigned_integer_size_t_(); // Always declared; defined elsewhere. This is the function that the public integer parsers of this class call.
352 
353 public:
354 
365  template<class T>
367  { // Body of parse_unsigned_integer<T>(): parse as size_t, then convert to T with a range check.
368  // No need to assert unsignedness here. We will later check that casting to the desired
369  // type worked, and we test for the correct sign there as well, so that it works for
370  // signed types.
371  // static_assert(
372  // std::is_unsigned<T>::value,
373  // "Need unsigned type for parse_unsigned_integer()"
374  // );
375 
376  auto const x = parse_unsigned_integer_size_t_();
377 
378  // We parsed as largest, and now try to cast to desired type,
379  // testing that back-conversion gives the same value and correct sign.
380  auto const r = static_cast<T>(x);
381  if( static_cast<size_t>(r) != x || r < 0 ) { // For unsigned T, `r < 0` is always false; it only matters for signed T.
382  throw std::overflow_error(
383  "Numerical overflow in " + source_name() + " at " + at() + "."
384  );
385  }
386  return r;
387  }
388 
398  template<class T>
400  { // Body of parse_signed_integer<T>(): read an optional sign, parse the digits as size_t, then convert to T with a range check.
401  static_assert(
402  std::is_signed<T>::value,
403  "Need signed type for parse_signed_integer()"
404  );
405 
406  if( data_pos_ >= data_end_ ) { // Nothing left to parse: throws std::runtime_error.
407  throw std::runtime_error(
408  "Expecting number in " + source_name() + " at " + at() + "."
409  );
410  }
411 
412  int sign = 1;
413  if( current_ == '-' || current_ == '+' ) {
414  if( current_ == '-' ) {
415  sign = -1;
416  }
417 
418  // Here we know that we are within limits of the buffer, and not at a new line.
419  // Let's not use the expensive operator then.
420  // operator++();
421  assert( data_pos_ < data_end_ );
422  assert( current_ != '\n' );
423  ++column_;
424  ++data_pos_;
425  current_ = buffer_[ data_pos_ ]; // NOTE(review): if the sign was the last char of the data, data_pos_ now equals data_end_, and this reads the slot after the valid data without the end-of-data and '\r' handling of set_current_char_(). Presumably parse_unsigned_integer_size_t_() rejects that case - confirm.
426  }
427 
428  // Parse as largest, and then try to cast to desired type,
429  // testing that back-conversion gives the same value and sign.
430  // The back-cast of `sign * r` is always valid, as the negative range of signed ints
431  // is smaller than the positive, so if it's negative, multiplying by -1 will always result
432  // in a valid value.
433  auto const x = parse_unsigned_integer_size_t_();
434  auto const r = sign * static_cast<T>(x); // The type of r is that of `int * T`, i.e., int for T narrower than int. NOTE(review): in two's complement, the negative range is larger than the positive by one, contrary to the comment above; if the cast yields the minimum value of T (for T at least as wide as int) and sign is -1, this product overflows - verify.
435  if( static_cast<size_t>( sign * r ) != x || !( r == 0 || (sign < 0) == (r < 0) )) { // Throws std::overflow_error if the magnitude does not round-trip, or if the sign of r does not match the parsed sign.
436  throw std::overflow_error(
437  "Numerical overflow in " + source_name() + " at " + at() + "."
438  );
439  }
440  return r;
441  }
442 
446  template<class T>
448  { // Body of parse_integer<T>(): alias for parse_signed_integer<T>().
449  return parse_signed_integer<T>(); // Hence T has to be a signed type, as enforced by the static_assert in that function.
450  }
451 
452  // -------------------------------------------------------------------------
453  // State
454  // -------------------------------------------------------------------------
455 
461  size_t line() const // Return the current line of the input stream.
462  {
463  return line_; // Starts at 1 for a stream constructed from a source; 0 after default construction or after reset_().
464  }
465 
472  size_t column() const // Return the current column of the input stream.
473  {
474  return column_; // Set back to 1 by operator++ when moving past a '\n'; 0 after default construction or after reset_().
475  }
476 
481  std::string at() const // Return the current position as text in the form "line:column"; used in the error messages of this class.
482  {
483  return std::to_string( line_ ) + ":" + std::to_string( column_ );
484  }
485 
489  inline bool good() const // Return true iff there is data left to read, i.e., the stream is not at its end.
490  {
491  return data_pos_ < data_end_; // Exact opposite of eof().
492  }
493 
498  inline explicit operator bool() const // Explicit conversion to bool, with the same result as good().
499  {
500  return data_pos_ < data_end_;
501  }
502 
506  inline bool eof() const // Return true iff the input reached its end.
507  {
508  return data_pos_ >= data_end_; // Exact opposite of good().
509  }
510 
522  std::string source_name() const // Get the name of the input source that this stream reads from.
523  {
524  return source_name_; // Returned by value, so each call copies the string. It is "invalid source" for a default-constructed stream.
525  }
526 
545  std::pair<char const*, size_t> buffer() // Direct access to the internal buffer: returns a pointer to the current position and the number of chars from there to data_end_.
546  {
547  assert( data_pos_ <= data_end_ ); // Keeps the subtraction below from wrapping around.
548  return { &buffer_[ data_pos_ ], data_end_ - data_pos_ };
549  }
550 
561  void jump_unchecked( size_t n ) // Jump forward by n chars within the currently buffered data; throws std::runtime_error if the target is not before the end of that data.
562  {
563  // Safety first! The check avoids computing `data_pos_ + n`, which wraps around for huge n and would then slip through.
564  if( data_pos_ >= data_end_ || n >= data_end_ - data_pos_ ) { // Same outcome as `data_pos_ + n >= data_end_` whenever that sum does not overflow. NOTE(review): a jump exactly onto the end of the data (n equal to the size reported by buffer()) is rejected as well - confirm that this is intended.
565  throw std::runtime_error(
566  "Invalid InputStream jump to position after buffer end."
567  );
568  }
569 
570  // Update the position as needed.
571  data_pos_ += n;
572  column_ += n; // line_ is left untouched, so this assumes that the skipped chars do not contain a line break.
573  if( data_pos_ >= BlockLength ) {
574  update_blocks_();
575  }
576  set_current_char_();
577  }
578 
579  // -------------------------------------------------------------------------
580  // Internal Members
581  // -------------------------------------------------------------------------
582 
583 private:
584 
588  void init_( std::shared_ptr<BaseInputSource> input_source ); // Declaration only, defined elsewhere. Called by the constructor that takes an input source; presumably sets up the reader, the source name and the buffer - confirm in the implementation.
589 
593  void reset_() // Put the stream into its end state: no current char, and both position counters at zero.
594  {
595  current_ = '\0';
596  column_ = 0;
597  line_ = 0; // The buffer and the data positions are left untouched.
598  }
599 
603  void update_blocks_(); // Declaration only, defined elsewhere. Called once data_pos_ has reached BlockLength; operator++ asserts data_pos_ < BlockLength after the call.
604 
609  inline void set_current_char_() // Load the char at data_pos_ into current_, handling the end of the data and '\r' line breaks.
610  {
611  // Check end of stream conditions.
612  if( data_pos_ >= data_end_ ) GENESIS_UNLIKELY {
613  // We do not expect to overshoot. Let's assert this, but if it still happens
614  // (in release build), we can also cope, and will just set \0 as the current char.
615  assert( data_pos_ == data_end_ );
616 
617  if( data_pos_ == data_end_ && data_pos_ > 0 && buffer_[ data_pos_ - 1 ] != '\n' ) {
618  // If this is the end of the data, but there was no closing \n, add one.
619  buffer_[ data_pos_ ] = '\n'; // NOTE(review): this writes at index data_end_, so the buffer has to be larger than the data it holds - confirm against the allocation, which is not visible here.
620  ++data_end_; // The added '\n' now counts as data; control continues below the outer if.
621  } else {
622  // If we reached the end, do not fully reset the line and column counters.
623  // They might be needed in some parser.
624  current_ = '\0';
625  return;
626  }
627  }
628 
629  // Treat stupid Windows and Mac lines breaks. Set them to \n, so that downstream parsers
630  // don't have to deal with this.
631  if( buffer_[ data_pos_ ] == '\r' ) {
632  buffer_[ data_pos_ ] = '\n'; // The buffer itself is modified, so direct readers via buffer() see '\n' here as well.
633 
634  // If this is a Win line break \r\n, skip one of them, so that only a single \n
635  // is visible to the outside. We do not treat \n\r line breaks properly here!
636  // If any system still uses those, we'd have to change code here.
637  if( data_pos_ + 1 < data_end_ && buffer_[ data_pos_ + 1 ] == '\n' ) { // Looks ahead only within the currently valid data.
638  ++data_pos_;
639  }
640  }
641 
642  // Set the char.
643  current_ = buffer_[ data_pos_ ];
644  }
645 
646  // -------------------------------------------------------------------------
647  // Data Members
648  // -------------------------------------------------------------------------
649 
650 private:
651 
652  // Input data comes from here...
653  // (we use a unique ptr to make the class movable)
654  std::unique_ptr<InputReader> input_reader_ = nullptr;
655  std::string source_name_; // Name of the source, as used in the error messages of this class.
656 
657  // ...and is buffered here.
658  char* buffer_; // Raw array, released with delete[] in the destructor.
659  size_t data_pos_; // Index of the current char within buffer_.
660  size_t data_end_; // Index one past the last valid char in buffer_; data_pos_ >= data_end_ means end of data.
661 
662  // Also, keep track of the current char and position in the data.
663  char current_; // Copy of buffer_[ data_pos_ ], or '\0' at the end of the data.
664  size_t line_;
665  size_t column_;
666 };
667 
668 // =================================================================================================
669 // Template Specializations
670 // =================================================================================================
671 
679 template<>
680 inline size_t InputStream::parse_unsigned_integer<size_t>() // Specialization for size_t, the type that the internal parser returns anyway.
681 {
682  return parse_unsigned_integer_size_t_(); // No cast and no overflow check needed, in contrast to the generic template.
683 }
684 
685 } // namespace utils
686 } // namespace genesis
687 
688 #endif // include guard
genesis::utils::InputStream::at
std::string at() const
Return a textual representation of the current input position in the form "line:column".
Definition: input_stream.hpp:481
genesis::utils::InputStream::read_char_or_throw
char read_char_or_throw(char const criterion)
Lexing function that reads a single char from the stream and checks whether it equals the provided one.
Definition: input_stream.hpp:271
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:81
genesis::utils::InputStream::column
size_t column() const
Return the current column of the input stream.
Definition: input_stream.hpp:472
genesis::utils::InputStream::InputStream
InputStream(self_type &&other)
Definition: input_stream.hpp:127
genesis::utils::InputStream::source_name
std::string source_name() const
Get the input source name where this stream reads from.
Definition: input_stream.hpp:522
genesis::utils::InputStream::self_type
InputStream self_type
Definition: input_stream.hpp:96
input_reader.hpp
genesis::utils::InputStream::buffer
std::pair< char const *, size_t > buffer()
Direct access to the internal buffer.
Definition: input_stream.hpp:545
genesis::utils::InputStream::get_char
char get_char()
Extract a single char from the input.
Definition: input_stream.hpp:224
std.hpp
Provides some valuable additions to STD.
genesis::utils::InputStream::operator++
self_type & operator++()
Move to the next char in the stream. Shortcut for advance().
Definition: input_stream.hpp:189
genesis::utils::InputStream::InputStream
InputStream()
Definition: input_stream.hpp:103
input_source.hpp
genesis::utils::InputStream::eof
bool eof() const
Return true iff the input reached its end.
Definition: input_stream.hpp:506
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
genesis::utils::InputStream::value_type
char value_type
Definition: input_stream.hpp:97
genesis::utils::InputStream::operator=
self_type & operator=(self_type const &)=delete
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::InputStream::parse_integer
T parse_integer()
Alias for parse_signed_integer().
Definition: input_stream.hpp:447
char.hpp
genesis::utils::InputStream::get_line
std::string get_line()
Read the current line and move to the beginning of the next.
Definition: input_stream.hpp:252
genesis::utils::InputStream::BlockLength
static const size_t BlockLength
Block length for internal buffering.
Definition: input_stream.hpp:94
genesis::utils::InputStream::line
size_t line() const
Return the current line of the input stream.
Definition: input_stream.hpp:461
genesis::utils::InputStream::current
char current() const
Return the current char, with some checks.
Definition: input_stream.hpp:162
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
genesis::utils::InputStream::parse_unsigned_integer
T parse_unsigned_integer()
Read an unsigned integer from a stream and return it.
Definition: input_stream.hpp:366
GENESIS_UNLIKELY
#define GENESIS_UNLIKELY
Definition: std.hpp:67
genesis::utils::InputStream::InputStream
InputStream(std::shared_ptr< BaseInputSource > input_source)
Definition: input_stream.hpp:113
genesis::utils::InputStream::parse_signed_integer
T parse_signed_integer()
Read a signed integer from a stream and return it.
Definition: input_stream.hpp:399
genesis::utils::InputStream::advance
self_type & advance()
Move to the next char in the stream and advance the counters.
Definition: input_stream.hpp:180
genesis::utils::InputStream::good
bool good() const
Return true iff the input is good (not end of data) and can be read from.
Definition: input_stream.hpp:489
genesis::utils::InputStream::read_char_or_throw
char read_char_or_throw(std::function< bool(char)> criterion)
Lexing function that reads a single char from the stream and checks whether it fulfills the provided criterion.
Definition: input_stream.hpp:294
genesis::utils::InputStream::jump_unchecked
void jump_unchecked(size_t n)
Jump forward in the stream by a certain amount of chars.
Definition: input_stream.hpp:561
genesis::utils::InputStream::operator*
char operator*() const
Dereference operator. Return the current char.
Definition: input_stream.hpp:143
genesis::utils::InputStream::~InputStream
~InputStream()
Definition: input_stream.hpp:120