A library for working with phylogenetic and population genetic data.
v0.27.0
input_stream.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
33 #include <algorithm>
34 #include <cassert>
35 #include <stdexcept>
36 
37 // For C++17, we have a little speedup in the integer parsing part.
38 #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
39 
40  #include <charconv>
41 
42 #endif
43 
44 namespace genesis {
45 namespace utils {
46 
47 // =================================================================================================
48 // Constructors and Rule of Five
49 // =================================================================================================
50 
52 {
53  if( this == &other ) {
54  return *this;
55  }
56 
57  input_reader_ = std::move( other.input_reader_ );
58  source_name_ = std::move( other.source_name_ );
59 
60  // Need to free our current buffer.
61  if( buffer_ ) {
62  delete[] buffer_;
63  }
64 
65  // Move the data.
66  buffer_ = other.buffer_;
67  data_pos_ = other.data_pos_;
68  data_end_ = other.data_end_;
69  current_ = other.current_;
70  line_ = other.line_;
71  column_ = other.column_;
72 
73  // Set the other in a valid but empty state and avoid double freeing of the buffer.
74  other.buffer_ = nullptr;
75  other.data_pos_ = 0;
76  other.data_end_ = 0;
77  other.current_ = '\0';
78  other.line_ = 0;
79  other.column_ = 0;
80 
81  return *this;
82 }
83 
84 // =================================================================================================
85 // Line Operations
86 // =================================================================================================
87 
88 void InputStream::get_line( std::string& target )
89 {
90  // Check edge case.
91  if( data_pos_ >= data_end_ ) {
92  return;
93  }
94 
95  // Loop until we find the end of the line. As this can be longer than one block,
96  // we might need to update the blocks and store the results in between.
97  while( true ) {
98  // Read data if necessary.
99  if( data_pos_ >= BlockLength ) GENESIS_UNLIKELY {
100  update_blocks_();
101  }
102  assert( data_pos_ < BlockLength );
103 
104  // Store the starting position, so that we can copy from there once we found the end.
105  size_t const start = data_pos_;
106 
107  // Read until the end of the line, but also stop before the end of the data,
108  // and after we read a full block. End of data: we are done anyway.
109  // End of block: need to read the next one first, so loop again.
110  auto const stop = std::min( data_end_, start + BlockLength );
111 
112  // 8-fold loop unrolling. Yes, the compiler does not do that.
113  // It gives some speedup, in particular if the reading is used in a parser that also
114  // does other things with the data. In a stand-alone line reader, it still gives
115  // a slight advantage.
116  while(
117  data_pos_ + 7 < stop &&
118  buffer_[ data_pos_ + 0 ] != '\n' &&
119  buffer_[ data_pos_ + 0 ] != '\r' &&
120  buffer_[ data_pos_ + 1 ] != '\n' &&
121  buffer_[ data_pos_ + 1 ] != '\r' &&
122  buffer_[ data_pos_ + 2 ] != '\n' &&
123  buffer_[ data_pos_ + 2 ] != '\r' &&
124  buffer_[ data_pos_ + 3 ] != '\n' &&
125  buffer_[ data_pos_ + 3 ] != '\r' &&
126  buffer_[ data_pos_ + 4 ] != '\n' &&
127  buffer_[ data_pos_ + 4 ] != '\r' &&
128  buffer_[ data_pos_ + 5 ] != '\n' &&
129  buffer_[ data_pos_ + 5 ] != '\r' &&
130  buffer_[ data_pos_ + 6 ] != '\n' &&
131  buffer_[ data_pos_ + 6 ] != '\r' &&
132  buffer_[ data_pos_ + 7 ] != '\n' &&
133  buffer_[ data_pos_ + 7 ] != '\r'
134  ) {
135  data_pos_ += 8;
136  }
137 
138  // Working AVX version. Not worth the trouble as of now. Keeping it here for reference.
139 
140  // #ifdef GENESIS_AVX
141  // #include <immintrin.h>
142  // #endif
143  //
144  // auto b = _mm256_loadu_si256(( __m256i const* )( buffer_ + data_pos_ ));
145  //
146  // static auto const n = _mm256_set1_epi8( '\n' );
147  // static auto const r = _mm256_set1_epi8( '\r' );
148  //
149  // auto bn = _mm256_cmpeq_epi8( b, n );
150  // auto br = _mm256_cmpeq_epi8( b, r );
151  //
152  // while(
153  // data_pos_ + 32 <= stop &&
154  // _mm256_testz_si256( bn, bn ) &&
155  // _mm256_testz_si256( bn, bn )
156  // ) {
157  // data_pos_ += 32;
158  // b = _mm256_loadu_si256(( __m256i const* )( buffer_ + data_pos_ ));
159  // bn = _mm256_cmpeq_epi8( b, n );
160  // br = _mm256_cmpeq_epi8( b, r );
161  // }
162 
163  // Alternative version taht uses 64bit words instead, and hence works without AVX.
164  // Uses macros from https://graphics.stanford.edu/~seander/bithacks.html
165 
166  // static auto const nmask = ~static_cast<uint64_t>(0) / 255U * '\n';
167  // static auto const rmask = ~static_cast<uint64_t>(0) / 255U * '\r';
168  //
169  // #define haszero(v) (((v) - static_cast<uint64_t>(0x0101010101010101)) & ~(v) & static_cast<uint64_t>(0x8080808080808080))
170  // #define hasvalue(x,n) (haszero((x) ^ (~static_cast<uint64_t>(0) / 255U * (n))))
171  //
172  // auto const* buffc = reinterpret_cast<uint64_t const*>( buffer_ + data_pos_ );
173  // size_t i = 0;
174  // while( true ) {
175  // bool const e = i*8 >= data_end_;
176  // bool const b = i*8 - start >= BlockLength;
177  //
178  // // bool const n = buffc[i] ^ nmask;
179  // // bool const r = buffc[i] ^ rmask;
180  // bool const n = hasvalue( buffc[i], '\n' );
181  // bool const r = hasvalue( buffc[i], '\r' );
182  //
183  // if( e | b | n | r ) {
184  // break;
185  // }
186  //
187  // ++i;
188  // }
189  // data_pos_ += i*8;
190  //
191  // #undef haszero
192  // #undef hasvalue
193 
194  // The above loop ends with data_pos_ somewhere before the exact line break.
195  // We now need to walk the rest by foot, and examine char by char.
196  while(
197  data_pos_ < stop &&
198  buffer_[ data_pos_ ] != '\n' &&
199  buffer_[ data_pos_ ] != '\r'
200  ) {
201  ++data_pos_;
202  }
203 
204  // Store what we have so far.
205  target.append( buffer_ + start, data_pos_ - start );
206 
207  // If the line is not yet finished, we need an extra round. Start the loop again.
208  assert( data_pos_ >= start );
209  if( data_pos_ - start >= BlockLength ) {
210  continue;
211  }
212 
213  // In all other cases, we stop here.
214  break;
215  }
216 
217  // Some safty.
218  assert( data_pos_ <= data_end_ );
219  assert( data_pos_ < 2 * BlockLength );
220 
221  // Check all cases that can occur.
222  if( data_pos_ == data_end_ ) {
223 
224  // Files might be missing the line break at the end of the last line.
225  // We catch this case here, so that we can be sure that the next conditions
226  // are actually valid when accessing the buffer.
227  // But we don't need to do anything in this case.
228 
229  } else if( buffer_[ data_pos_ ] == '\n' ) {
230  ++data_pos_;
231 
232  } else if( buffer_[ data_pos_ ] == '\r' ) {
233  ++data_pos_;
234 
235  // Treat stupid Windows \r\n lines breaks.
236  // We already moved past the \r, so check the next char.
237  if( data_pos_ < data_end_ && buffer_[ data_pos_ ] == '\n' ) {
238  ++data_pos_;
239  }
240  } else {
241  // We have checked all cases where the loop above can terminate.
242  // So this should not happen.
243  assert( false );
244  }
245 
246  // Set char and counters. It checks for end of the file,
247  // so this is safe if we are past the end already.
248  set_current_char_();
249  ++line_;
250  column_ = 1;
251 }
252 
253 // =================================================================================================
254 // Parsing
255 // =================================================================================================
256 
257 // Only use intrinsics version for the compilers that support them!
258 #if defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
259 
260 size_t InputStream::parse_unsigned_integer_gcc_intrinsic_()
261 {
262  // This function only works on little endian systems (I think).
263  // We do not check this here, as so far, no one has tried to run our code on any machine
264  // that is not little endian. So we are good for now. In case this code needs to be adapted
265  // to big endian as well: I think the only change required is the `chunk <<= ...` that needs
266  // to turn into a right shift instead. Not entirely sure though.
267  // Also, in this function, we make use of the fact that our internal buffer is always way larger
268  // than any valid integer input. That is, we may read from after the block end, or even the
269  // stream end, but we have enough buffer for this to be okay (after all, we are just reading
270  // eight bytes here). We then check for this later.
271 
272  // Copy 8 bytes into a chunk that we process as one unit.
273  std::uint64_t chunk = 0;
274  std::memcpy( &chunk, &buffer_[ data_pos_ ], sizeof( chunk ));
275 
276  // Helper macro functions to check whether a word has bytes that are less than or greater
277  // than some specified value, and mark these bytes.
278  // http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
279  // http://graphics.stanford.edu/~seander/bithacks.html#HasMoreInWord
280  auto const zero = static_cast<uint64_t>(0);
281  #define hasless(x,n) (((x)-~zero/255*(n))&~(x)&~zero/255*128)
282  #define hasmore(x,n) ((((x)+~zero/255*(127-(n)))|(x))&~zero/255*128)
283 
284  // Get all positions that are not digits, by marking a bit in their respective byte.
285  auto const l = hasless( chunk, '0' );
286  auto const m = hasmore( chunk, '9' );
287  auto const p = l | m;
288 
289  // Example:
290  // String "167\n253\n" turns into chunk c (on little endian systems)
291  // \n 3 5 2 \n 7 6 1
292  // c 00001010 00110011 00110101 00110010 00001010 00110111 00110110 00110001
293  // l 10000000 00000000 00000000 00000000 10000000 00000000 00000000 00000000
294  // m 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
295  // p 10000000 00000000 00000000 00000000 10000000 00000000 00000000 00000000
296  // ^ ^
297  // with the two '\n' bytes marked.
298 
299  #undef hasless
300  #undef hasmore
301 
302  // Find the index of the first byte that is not a digit. We first get the bit position
303  // using an intrinsic, and then divite by 8 to get the byte. The branching to select the
304  // correct intrinsic should be resolved at compile time already.
305  // We are using __builtin_ffs and its variants:
306  // Returns one plus the index of the least significant 1-bit of x, or if x is zero, returns zero.
307  // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html#Other-Builtins
308  int idx = 0;
309  if( sizeof(int) == sizeof(std::uint64_t) ) {
310  idx = __builtin_ffs(p) / 8;
311  } else if( sizeof(long) == sizeof(std::uint64_t) ) {
312  idx = __builtin_ffsl(p) / 8;
313  } else if( sizeof(long long) == sizeof(std::uint64_t) ) {
314  idx = __builtin_ffsll(p) / 8;
315  } else {
316  static_assert(
317  ( sizeof(int) == sizeof(std::uint64_t) ) ||
318  ( sizeof(long) == sizeof(std::uint64_t) ) ||
319  ( sizeof(long long) == sizeof(std::uint64_t) ),
320  "No compilter intrinsic __builtin_ffs[l][l] for std::uint64_t"
321  );
322  throw std::runtime_error(
323  "No compilter intrinsic __builtin_ffs[l][l] for std::uint64_t"
324  );
325  }
326  assert( 0 <= idx && idx <= 8 );
327 
328  // Not needed but kept for reference: Mask out all bits that we do not want.
329  // auto const mask = ~(~zero << ((idx-1)*8));
330  // chunk &= mask;
331 
332  // On little endian systems, we need to move the actual data chars that we want to parse to the
333  // left-most position for the following code to work. So, for our example from above, we need
334  // to move the "xxxx x761" in the chunk so that we get "7610 0000".
335  chunk <<= (8 * ( 8 - idx + 1 ));
336 
337  // Now use an O(log(n)) method of computing the result, where we combine adjacent parts into
338  // numbers, first 2 bytes, then 4 bytes, then all 8 bytes. Inspired by parse_8_chars() from
339  // https://kholdstare.github.io/technical/2020/05/26/faster-integer-parsing.html
340 
341  // 1-byte mask trick (works on 4 pairs of single digits)
342  std::uint64_t lower_digits = (chunk & 0x0f000f000f000f00) >> 8;
343  std::uint64_t upper_digits = (chunk & 0x000f000f000f000f) * 10;
344  chunk = lower_digits + upper_digits;
345 
346  // 2-byte mask trick (works on 2 pairs of two digits)
347  lower_digits = (chunk & 0x00ff000000ff0000) >> 16;
348  upper_digits = (chunk & 0x000000ff000000ff) * 100;
349  chunk = lower_digits + upper_digits;
350 
351  // 4-byte mask trick (works on pair of four digits)
352  lower_digits = (chunk & 0x0000ffff00000000) >> 32;
353  upper_digits = (chunk & 0x000000000000ffff) * 10000;
354  chunk = lower_digits + upper_digits;
355 
356  // Edge cases. We treat them at the end, so that in the standard cases, the processor
357  // does not come to a grinding halt when trying to figure out if these cases apply;
358  // this might be premature optimization, but in our tests, it made the function slightly faster.
359  // If the returned index is 0, there was no non-digit byte in the chunk,
360  // so we run the naive loop instead. We could also call this function here again recursively,
361  // summing up parts of large numbers. But that would mean that we need to do overflow
362  // detection and all that, and currently, this does not seem needed. Let's be lazy today.
363  // Furthermore, if the 8 bytes that we process here are at the end of the stream, we cannot
364  // confidently use them, in cases for example where the stream ends in a number, but does
365  // not have a new line char at the end. So in that case, better parse naievely.
366  // Lastly, if the index is 1, the first byte is not a digit, which is an error, as this function
367  // is only called from parsers that expect a number.
368  if( idx == 0 || data_pos_ + 8 >= data_end_ ) {
369  return parse_unsigned_integer_naive_();
370  }
371  if( idx == 1 ) {
372  throw std::runtime_error(
373  "Expecting integer in " + source_name() + " at " + at() + "."
374  );
375  }
376 
377  // Now move as far as needed in the buffer...
378  data_pos_ += idx - 1;
379  column_ += idx - 1;
380  set_current_char_();
381 
382  // ...and finally initiate the next block if needed.
383  if( data_pos_ >= BlockLength ) {
384  update_blocks_();
385  }
386  assert( data_pos_ < BlockLength );
387 
388  return chunk;
389 }
390 
391 size_t InputStream::parse_unsigned_integer_from_chars_()
392 {
393 
394  // Re-implementation of the gcc from_chars() code.
395  // https://github.com/gcc-mirror/gcc/blob/12bb62fbb47bd2848746da53c72ed068a4274daf/libstdc++-v3/include/std/charconv
396  // Currently not in use and not well tested!
397 
398  // Prepare. We alias T, in case we want to refactor to a template function at some point.
399  using T = size_t;
400  using namespace utils;
401  T x = 0;
402 
403  int const base = 10;
404  auto raise_and_add_ = [base]( T& val, unsigned char c ) {
405  return !(
406  __builtin_mul_overflow( val, base, &val ) ||
407  __builtin_add_overflow( val, c, &val )
408  );
409  };
410 
411  auto from_chars_digit_ = [&]( char const*& first, char const* last, T& val ) {
412  while( first != last ) {
413  char const c = *first;
414  if( is_digit(c) ) {
415  if( !raise_and_add_(val, c - '0') ) {
416  return false;
417  }
418  first++;
419  } else {
420  return true;
421  }
422  }
423  return true;
424  };
425 
426  char const* start = &buffer_[ data_pos_ ];
427  char const* end = &buffer_[ data_end_ ];
428  auto const valid = from_chars_digit_( start, end, x );
429  auto const dist = start - &buffer_[ data_pos_ ];
430 
431  if( dist == 0 ) {
432  throw std::runtime_error(
433  "Expecting integer in " + source_name() + " at " + at() + "."
434  );
435  } else if( !valid ) {
436  throw std::overflow_error(
437  "Numerical overflow in " + source_name() + " at " + at() + "."
438  );
439  } else if( std::is_signed<T>::value ) {
440  assert( false );
441  // T tmp;
442  // if (__builtin_mul_overflow(x, sign, &tmp)) {
443  // throw std::overflow_error(
444  // "Numerical overflow in " + source_name() + " at " + at() + "."
445  // );
446  // }
447  }
448 
449  // Move to where we the parsing left us.
450  data_pos_ += dist;
451  column_ += dist;
452  set_current_char_();
453 
454  // Now finally initiate the next block if needed.
455  if( data_pos_ >= BlockLength ) {
456  update_blocks_();
457  }
458  assert( data_pos_ < BlockLength );
459 
460  return x;
461 }
462 
463 #endif // defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
464 
465 #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
466 
467 size_t InputStream::parse_unsigned_integer_std_from_chars_()
468 {
469  // Uses the C++17 std::from_chars() function.
470  // Currently not in use and not well tested!
471 
472  // Prepare. We alias T, in case we want to refactor to a template function at some point.
473  using T = size_t;
474  using namespace utils;
475  T x = 0;
476 
477  // Fastest method accoing to
478  // https://www.fluentcpp.com/2018/07/27/how-to-efficiently-convert-a-string-to-an-int-in-c/
479  // is from_chars(), so let's us it!
480 
481  auto const conv = std::from_chars( &buffer_[ data_pos_ ], &buffer_[ data_end_ ], x );
482 
483  // How many chars did we consume?
484  auto const dist = conv.ptr - &buffer_[ data_pos_ ];
485 
486  // Check that we processed at least one digit, as this function is only called when the
487  // input format requires an integer. This is equivalent to the check in the non C++17 version
488  // below for data_pos_ >= data_end_ || ! is_digit( current_ )
489  if( dist == 0 ) {
490  throw std::runtime_error(
491  "Expecting integer in " + source_name() + " at " + at() + "."
492  );
493  }
494 
495  if( conv.ec != std::errc() ) {
496  if( conv.ec == std::errc::result_out_of_range ) {
497  throw std::overflow_error(
498  "Numerical overflow in " + source_name() + " at " + at() + "."
499  );
500  } else if( conv.ec == std::errc::invalid_argument ) {
501  // Cannot happen, as we above checked that there is at least one digit.
502  assert( false );
503  } else {
504  // Cannot happen, as we caught every case of `ec`.
505  assert( false );
506  }
507 
508  // In either case, we need to stop here.
509  throw std::overflow_error(
510  "Integer parsing error in " + source_name() + " at " + at() + "."
511  );
512  }
513 
514  // Move to where we the parsing left us.
515  column_ += dist;
516  data_pos_ += dist;
517 
518  // Now finally initiate the next block if needed.
519  if( data_pos_ >= BlockLength ) {
520  update_blocks_();
521  }
522  assert( data_pos_ < BlockLength );
523 
524  // Finally we also need to update the char so that new lines are taken care of.
525  set_current_char_();
526 
527  return x;
528 }
529 
530 #endif // ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
531 
532 size_t InputStream::parse_unsigned_integer_naive_()
533 {
534  // Prepare. We alias T, in case we want to refactor to a template function at some point.
535  using T = size_t;
536  using namespace utils;
537  T x = 0;
538 
539  if( data_pos_ >= data_end_ || ! is_digit( current_ ) ) {
540  throw std::runtime_error(
541  "Expecting integer in " + source_name() + " at " + at() + "."
542  );
543  }
544 
545  while(( data_pos_ < data_end_ ) && is_digit( current_ )) {
546  T y = current_ - '0';
547 
548  if( x > ( std::numeric_limits<T>::max() - y ) / 10 ) {
549  throw std::overflow_error(
550  "Numerical overflow in " + source_name() + " at " + at() + "."
551  );
552  }
553 
554  x = 10 * x + y;
555 
556  // In the original function that was not part of this class, we simply called
557  // advance() here, to move to the next char. However, here, we already know that
558  // we have data_pos_ < data_end_, and that we do not have a new line.
559  // Furthermore, we also can ignore the update for block length while in this loop
560  // (or maybe even completely), as it does not matter much if we move a bit into the
561  // second block before starting the reading thread again. This loop here cannot
562  // iterate that many times anyway before we overflow the interger.
563  // So let's simply move on to the next char.
564  // advance();
565  assert( data_pos_ < data_end_ );
566  assert( current_ != '\n' );
567  ++column_;
568  ++data_pos_;
569  current_ = buffer_[ data_pos_ ];
570  }
571 
572  // Now finally initiate the next block if needed.
573  if( data_pos_ >= BlockLength ) {
574  update_blocks_();
575  }
576  assert( data_pos_ < BlockLength );
577 
578  // Finally we also need to update the char so that new lines are taken care of.
579  set_current_char_();
580 
581  return x;
582 }
583 
584 size_t InputStream::parse_unsigned_integer_size_t_()
585 {
586  // Select the fastest alternative available for a given compiler and C++ version.
587  #if defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
588 
589  // If we have GCC or Clang, use our own handcrafted fast-as-hell implementation.
590  return parse_unsigned_integer_gcc_intrinsic_();
591 
592  // #elif ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
593  //
594  // // Otherwise, if this is C++17, at least use its own fast version,
595  // // that can use some compiler intrinsics itself.
596  // return parse_unsigned_integer_std_from_chars_();
597 
598  #else
599 
600  // If neither, just use the slow, naive loop.
601  return parse_unsigned_integer_naive_();
602 
603  #endif
604 }
605 
606 // =================================================================================================
607 // Internal Members
608 // =================================================================================================
609 
610 void InputStream::init_( std::shared_ptr<BaseInputSource> input_source )
611 {
612  // Set to empty defaults if there is no input.
613  if( input_source == nullptr ) {
614  source_name_ = "invalid source";
615 
616  buffer_ = nullptr;
617  data_pos_ = 0;
618  data_end_ = 0;
619 
620  current_ = '\0';
621  line_ = 0;
622  column_ = 0;
623  return;
624  }
625 
626  // We use three buffer blocks: The first two for the current block/line.
627  // The max line length is one buffer length, so the beginning of the line is always
628  // in the first block, while its end can reach into the second block, but never exeed it.
629  // The third block is for the async reading.
630  buffer_ = new char[ 3 * BlockLength ];
631 
632  try {
633  // Set source name.
634  source_name_ = input_source->source_name();
635 
636  // Read up to two blocks.
637  data_pos_ = 0;
638  data_end_ = input_source->read( buffer_, 2 * BlockLength );
639 
640  // Skip UTF-8 BOM, if found.
641  if( data_end_ >= 3 &&
642  buffer_[0] == '\xEF' &&
643  buffer_[1] == '\xBB' &&
644  buffer_[2] == '\xBF'
645  ) {
646  data_pos_ = 3;
647  }
648 
649  // If there was no data, set to "empty" values.
650  if( data_pos_ == data_end_ ) {
651  reset_();
652 
653  // If there is data, set char value.
654  } else {
655  set_current_char_();
656  }
657 
658  // If there is more data after the two blocks that we just read, start the
659  // reading process (possibly async, if pthreads is available).
660  if( data_end_ == 2 * BlockLength ) {
661 
662  // Create the reader. We need to do this explictily,
663  // as we use a unique ptr to make this class movable.
664  input_reader_ = utils::make_unique<InputReader>();
665 
666  input_reader_->init( input_source );
667  input_reader_->start_reading( buffer_ + 2 * BlockLength, BlockLength );
668  }
669 
670  } catch( ... ) {
671  delete[] buffer_;
672  throw;
673  }
674 }
675 
676 void InputStream::update_blocks_()
677 {
678  // This function is only called locally in contexts where we already know that we need to
679  // update the blocks. We only assert this here again, meaning that we expect the caller
680  // functions to check for this already. Handling it this way ensures that the function
681  // jump is only made when necesary.
682  assert( data_pos_ >= BlockLength );
683 
684  // Furthermore, the callers need to check the following condition. So, if it breaks, this
685  // function is invalidly called from somewhere else.
686  assert( data_pos_ < data_end_ );
687 
688  // If this assertion breaks, someone tempered with our internal invariants.
689  assert( data_end_ <= BlockLength * 2 );
690 
691  // Move the second to the first block.
692  std::memcpy( buffer_, buffer_ + BlockLength, BlockLength );
693  data_pos_ -= BlockLength;
694  data_end_ -= BlockLength;
695 
696  // If we are not yet at the end of the data, start the reader again:
697  // Copy the third block to the second, and read into the third one.
698  if( input_reader_ && input_reader_->valid() ) {
699  data_end_ += input_reader_->finish_reading();
700  std::memcpy( buffer_ + BlockLength, buffer_ + 2 * BlockLength, BlockLength );
701  input_reader_->start_reading( buffer_ + 2 * BlockLength, BlockLength );
702  }
703 
704  // After the update, the current position needs to be within the first block.
705  assert( data_pos_ < BlockLength );
706 }
707 
708 } // namespace utils
709 } // namespace genesis
genesis::utils::InputStream::at
std::string at() const
Return a textual representation of the current input position in the form "line:column".
Definition: input_stream.hpp:481
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:81
genesis::utils::InputStream::source_name
std::string source_name() const
Get the input source name where this stream reads from.
Definition: input_stream.hpp:522
input_stream.hpp
genesis::utils::InputStream::operator=
self_type & operator=(self_type const &)=delete
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::is_digit
constexpr bool is_digit(char c) noexcept
Return whether a char is a digit (0-9), ASCII-only.
Definition: char.hpp:95
genesis::utils::InputStream::get_line
std::string get_line()
Read the current line and move to the beginning of the next.
Definition: input_stream.hpp:252
genesis::utils::InputStream::BlockLength
static const size_t BlockLength
Block length for internal buffering.
Definition: input_stream.hpp:94
GENESIS_UNLIKELY
#define GENESIS_UNLIKELY
Definition: std.hpp:67