A library for working with phylogenetic and population genetic data.
v0.32.0
input_stream.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_IO_INPUT_STREAM_H_
2 #define GENESIS_UTILS_IO_INPUT_STREAM_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
38 
39 #include <array>
40 #include <cassert>
41 #include <cstdint>
42 #include <functional>
43 #include <memory>
44 #include <stdexcept>
45 #include <string>
46 #include <utility>
47 
48 #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
49 
50  #include <string_view>
51 
52 #endif // ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
53 
54 namespace genesis {
55 namespace utils {
56 
57 // =================================================================================================
58 // Input Stream
59 // =================================================================================================
60 
89 {
90 public:
91 
92  // -------------------------------------------------------------------------
93  // Member Types
94  // -------------------------------------------------------------------------
95 
101  static const size_t BlockLength = 1 << 22;
102 
104  using value_type = char;
105 
106  // -------------------------------------------------------------------------
107  // Constructors and Rule of Five
108  // -------------------------------------------------------------------------
109 
111  : source_name_( "invalid source" )
112  , buffer_( nullptr )
113  , data_pos_( 0 )
114  , data_end_( 0 )
115  , current_( '\0' )
116  , line_( 0 )
117  , column_( 0 )
118  {}
119 
120  explicit InputStream( std::shared_ptr<BaseInputSource> input_source )
121  : line_( 1 )
122  , column_( 1 )
123  {
124  init_( input_source );
125  }
126 
128  {
129  delete[] buffer_;
130  buffer_ = nullptr;
131  }
132 
133  InputStream( self_type const& ) = delete;
135  : buffer_( nullptr )
136  {
137  *this = std::move( other );
138  }
139 
140  self_type& operator= ( self_type const& ) = delete;
141  self_type& operator= ( self_type&& other );
142 
143  // -------------------------------------------------------------------------
144  // Stream Iterator Operations
145  // -------------------------------------------------------------------------
146 
150  inline char operator* () const
151  {
152  return current_;
153  }
154 
169  inline char current() const
170  {
171  if( data_pos_ >= data_end_ ) GENESIS_UNLIKELY {
172  throw std::runtime_error(
173  "Unexpected end of " + source_name() + " at " + at() + "."
174  );
175  }
176  if( current_ < 0 ) GENESIS_UNLIKELY {
177  throw std::domain_error(
178  "Invalid input char in " + source_name() + " at " + at() + "."
179  );
180  }
181  return current_;
182  }
183 
187  inline self_type& advance()
188  {
189  operator++();
190  return *this;
191  }
192 
197  {
198  // If we were already at the end, set the counters to zero.
199  if( data_pos_ >= data_end_ ) GENESIS_UNLIKELY {
200  reset_();
201  return *this;
202  }
203 
204  // Read data if necessary.
205  if( data_pos_ >= BlockLength ) GENESIS_UNLIKELY {
206  update_blocks_();
207  }
208  assert( data_pos_ < BlockLength );
209 
210  // In case we are moving to a new line, set the counters accordingly.
211  if( current_ == '\n' ) {
212  ++line_;
213  column_ = 1;
214  } else {
215  ++column_;
216  }
217 
218  // Next position.
219  ++data_pos_;
220 
221  // Set the char.
222  set_current_char_();
223  return *this;
224  }
225 
231  inline char get_char()
232  {
233  char ret = current_;
234  operator++();
235  return ret;
236  }
237 
238  // -------------------------------------------------------------------------
239  // Char Operations
240  // -------------------------------------------------------------------------
241 
250  char read_char_or_throw( char const criterion );
251 
252 
261  char read_char_or_throw( std::function<bool (char)> criterion );
262 
263  // -------------------------------------------------------------------------
264  // Line Operations
265  // -------------------------------------------------------------------------
266 
275  void get_line( std::string& target );
276 
284  std::string get_line()
285  {
286  std::string result;
287  get_line( result );
288  return result;
289  }
290 
291  #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
292 
314  std::string_view get_line_view();
315 
339  template<size_t N>
340  std::array<std::string_view, N> get_line_views()
341  {
342  // We relay the work to a non-template inner function, to keep this header clean.
343  std::array<std::string_view, N> result;
344  fill_line_views_( result.data(), result.size() );
345  return result;
346  }
347 
348 private:
349 
350  // Internal helper that does the actual work of the get_line_views() function.
351  void fill_line_views_( std::string_view* str_views, size_t n_lines );
352 
353  #endif // ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
354 
355 private:
356 
357  // Internal functions for the line operations.
358  size_t update_and_move_to_line_or_buffer_end_();
359  void move_to_line_or_buffer_end_( size_t const stop_pos );
360  void approach_line_or_buffer_end_avx512_( size_t const stop_pos );
361  void approach_line_or_buffer_end_avx2_( size_t const stop_pos );
362  void approach_line_or_buffer_end_unrolled_( size_t const stop_pos );
363  void increment_to_next_line_();
364 
365 public:
366 
367  // -------------------------------------------------------------------------
368  // Buffer Access
369  // -------------------------------------------------------------------------
370 
390  std::pair<char const*, size_t> buffer()
391  {
392  assert( data_pos_ <= data_end_ );
393  return { &buffer_[ data_pos_ ], data_end_ - data_pos_ };
394  }
395 
406  void jump_unchecked( size_t n );
407 
408  // -------------------------------------------------------------------------
409  // State
410  // -------------------------------------------------------------------------
411 
417  size_t line() const
418  {
419  return line_;
420  }
421 
428  size_t column() const
429  {
430  return column_;
431  }
432 
437  std::string at() const
438  {
439  return std::to_string( line_ ) + ":" + std::to_string( column_ );
440  }
441 
445  inline bool good() const
446  {
447  return data_pos_ < data_end_;
448  }
449 
454  inline explicit operator bool() const
455  {
456  return data_pos_ < data_end_;
457  }
458 
462  inline bool eof() const
463  {
464  return data_pos_ >= data_end_;
465  }
466 
478  std::string source_name() const
479  {
480  return source_name_;
481  }
482 
483  // -------------------------------------------------------------------------
484  // Internal Members
485  // -------------------------------------------------------------------------
486 
487 private:
488 
492  void init_( std::shared_ptr<BaseInputSource> input_source );
493 
497  void reset_()
498  {
499  line_ = 0;
500  column_ = 0;
501  current_ = '\0';
502  }
503 
507  void update_blocks_();
508 
513  void set_current_char_();
514 
515  // -------------------------------------------------------------------------
516  // Data Members
517  // -------------------------------------------------------------------------
518 
519 private:
520 
521  // Input data comes from here...
522  // (we use a unique ptr to make the class movable)
523  std::unique_ptr<InputReader> input_reader_ = nullptr;
524  std::string source_name_;
525 
526  // ...and is buffered here.
527  char* buffer_;
528  size_t data_pos_;
529  size_t data_end_;
530 
531  // Also, keep track of the current char and position in the data.
532  char current_;
533  size_t line_;
534  size_t column_;
535 };
536 
537 } // namespace utils
538 } // namespace genesis
539 
540 #endif // include guard
genesis::utils::InputStream::at
std::string at() const
Return a textual representation of the current input position in the form "line:column".
Definition: input_stream.hpp:437
genesis::utils::InputStream::read_char_or_throw
char read_char_or_throw(char const criterion)
Lexing function that reads a single char from the stream and checks whether it equals the provided one.
Definition: input_stream.cpp:89
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
genesis::utils::InputStream::column
size_t column() const
Return the current column of the input stream.
Definition: input_stream.hpp:428
genesis::utils::InputStream::InputStream
InputStream(self_type &&other)
Definition: input_stream.hpp:134
genesis::utils::InputStream::source_name
std::string source_name() const
Get the input source name where this stream reads from.
Definition: input_stream.hpp:478
genesis::utils::InputStream::self_type
InputStream self_type
Definition: input_stream.hpp:103
input_reader.hpp
genesis::utils::InputStream::buffer
std::pair< char const *, size_t > buffer()
Direct access to the internal buffer.
Definition: input_stream.hpp:390
genesis::utils::InputStream::get_char
char get_char()
Extract a single char from the input.
Definition: input_stream.hpp:231
std.hpp
Provides some valuable additions to STD.
genesis::utils::InputStream::operator++
self_type & operator++()
Move to the next char in the stream. Shortcut for advance().
Definition: input_stream.hpp:196
genesis::utils::InputStream::InputStream
InputStream()
Definition: input_stream.hpp:110
input_source.hpp
genesis::utils::InputStream::eof
bool eof() const
Return true iff the input reached its end.
Definition: input_stream.hpp:462
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::utils::InputStream::value_type
char value_type
Definition: input_stream.hpp:104
genesis::utils::InputStream::operator=
self_type & operator=(self_type const &)=delete
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
char.hpp
genesis::utils::InputStream::get_line
std::string get_line()
Read the current line and move to the beginning of the next.
Definition: input_stream.hpp:284
genesis::utils::InputStream::BlockLength
static const size_t BlockLength
Block length for internal buffering.
Definition: input_stream.hpp:101
genesis::utils::InputStream::line
size_t line() const
Return the current line of the input stream.
Definition: input_stream.hpp:417
genesis::utils::InputStream::current
char current() const
Return the current char, with some checks.
Definition: input_stream.hpp:169
GENESIS_UNLIKELY
#define GENESIS_UNLIKELY
Definition: std.hpp:67
genesis::utils::InputStream::InputStream
InputStream(std::shared_ptr< BaseInputSource > input_source)
Definition: input_stream.hpp:120
genesis::utils::InputStream::advance
self_type & advance()
Move to the next char in the stream and advance the counters.
Definition: input_stream.hpp:187
genesis::utils::InputStream::good
bool good() const
Return true iff the input is good (not end of data) and can be read from.
Definition: input_stream.hpp:445
genesis::utils::InputStream::jump_unchecked
void jump_unchecked(size_t n)
Jump forward in the stream by a certain amount of chars.
Definition: input_stream.cpp:609
genesis::utils::InputStream::operator*
char operator*() const
Dereference operator. Return the current char.
Definition: input_stream.hpp:150
genesis::utils::InputStream::~InputStream
~InputStream()
Definition: input_stream.hpp:127