A library for working with phylogenetic and population genetic data.
v0.32.0
input_stream.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
34 
35 #include <algorithm>
36 #include <cassert>
37 #include <stdexcept>
38 
39 #if defined(GENESIS_AVX) || defined(GENESIS_AVX2) || defined(GENESIS_AVX512)
40 
41  #include <immintrin.h>
42 
43 #endif
44 
45 namespace genesis {
46 namespace utils {
47 
48 // =================================================================================================
49 // Constructors and Rule of Five
50 // =================================================================================================
51 
53 {
54  if( this == &other ) {
55  return *this;
56  }
57 
58  input_reader_ = std::move( other.input_reader_ );
59  source_name_ = std::move( other.source_name_ );
60 
61  // Need to free our current buffer.
62  if( buffer_ ) {
63  delete[] buffer_;
64  }
65 
66  // Move the data.
67  buffer_ = other.buffer_;
68  data_pos_ = other.data_pos_;
69  data_end_ = other.data_end_;
70  current_ = other.current_;
71  line_ = other.line_;
72  column_ = other.column_;
73 
74  // Set the other in a valid but empty state and avoid double freeing of the buffer.
75  other.buffer_ = nullptr;
76  other.data_pos_ = 0;
77  other.data_end_ = 0;
78  other.current_ = '\0';
79  other.line_ = 0;
80  other.column_ = 0;
81 
82  return *this;
83 }
84 
85 // =================================================================================================
86 // Char Operations
87 // =================================================================================================
88 
89 char InputStream::read_char_or_throw( char const criterion )
90 {
91  // Check char and move to next.
92  if( data_pos_ >= data_end_ || current_ != criterion ) GENESIS_UNLIKELY {
93  throw std::runtime_error(
94  std::string("In ") + source_name() + ": " +
95  "Expecting " + char_to_hex( criterion ) + " at " + at() + ", " +
96  "but received " + char_to_hex( current_ ) + " instead."
97  );
98  }
99  assert( good() && current_ == criterion );
100  operator++();
101  return criterion;
102 }
103 
104 char InputStream::read_char_or_throw( std::function<bool (char)> criterion )
105 {
106  // Check char and move to next.
107  if( data_pos_ >= data_end_ || !criterion( current_ )) GENESIS_UNLIKELY {
108  throw std::runtime_error(
109  std::string("In ") + source_name() + ": " +
110  "Unexpected char " + char_to_hex( current_ ) + " at " + at() + "."
111  );
112  }
113  assert( good() );
114  auto const chr = current_;
115  operator++();
116  return chr;
117 }
118 
119 // =================================================================================================
120 // Line Operations
121 // =================================================================================================
122 
123 // -------------------------------------------------------------------------
124 // get_line
125 // -------------------------------------------------------------------------
126 
127 void InputStream::get_line( std::string& target )
128 {
129  // Check edge case.
130  if( data_pos_ >= data_end_ ) {
131  return;
132  }
133 
134  // Loop until we find the end of the line. As this can be longer than one block,
135  // we might need to update the blocks and store the results in between.
136  while( true ) {
137 
138  // Move data_pos_ to the end of the line or end of the buffered data.
139  // We end at either the end of the data, or have moved a whole block
140  // or until we found a new line character.
141  auto const move_dist = update_and_move_to_line_or_buffer_end_();
142  assert(
143  data_pos_ == data_end_ ||
144  move_dist == BlockLength ||
145  buffer_[ data_pos_ ] == '\n' ||
146  buffer_[ data_pos_ ] == '\r'
147  );
148  assert( move_dist <= BlockLength );
149  assert( move_dist <= data_pos_ );
150 
151  // Store what we have so far.
152  target.append( buffer_ + data_pos_ - move_dist, move_dist );
153 
154  // If the line is not yet finished, we need an extra round. Start the loop again.
155  if( move_dist == BlockLength ) {
156  continue;
157  }
158 
159  // In all other cases, we stop here. Either we are at the end of the data,
160  // or have found the characters we are looking for.
161  break;
162  }
163 
164  // If we are here, we have either found our char and are at the end of the line, or we have
165  // reached the end of the input. In the first case, we move to the beginning of the next line.
166  assert( data_pos_ == data_end_ || buffer_[ data_pos_ ] == '\n' || buffer_[ data_pos_ ] == '\r' );
167  increment_to_next_line_();
168  assert( data_pos_ == data_end_ || column_ == 1 );
169 }
170 
171 #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
172 
173 // -------------------------------------------------------------------------
174 // get_line_view
175 // -------------------------------------------------------------------------
176 
177 std::string_view InputStream::get_line_view()
178 {
179  // Check edge case.
180  if( data_pos_ >= data_end_ ) {
181  return std::string_view();
182  }
183 
184  // Move data_pos_ to the end of the line or end of the buffered data.
185  // Similar to the above get_line().
186  auto const move_dist = update_and_move_to_line_or_buffer_end_();
187  assert(
188  data_pos_ == data_end_ ||
189  move_dist == BlockLength ||
190  buffer_[ data_pos_ ] == '\n' ||
191  buffer_[ data_pos_ ] == '\r'
192  );
193  assert( move_dist <= BlockLength );
194  assert( move_dist <= data_pos_ );
195 
196  // If the line is not yet finished after a full block, we cannot use this function.
197  if( move_dist == BlockLength ) {
198  throw std::runtime_error(
199  "Cannot call InputStream::get_line_view() on lines that are longer "
200  "than the internal buffer of " + to_string_byte_format( BlockLength ) + " bytes"
201  );
202  }
203 
204  // We have moved, and might also have updated the blocks before, so we need to work backwards
205  // from where we are now to get the positions we want for our view.
206  auto result = std::string_view( buffer_ + data_pos_ - move_dist, move_dist );
207 
208  // If we are here, we have either found our char and are at the end of the line, or we have
209  // reached the end of the input. In the first case, we move to the beginning of the next line.
210  // The function below shall not call update_blocks_(), as otherwise our return value
211  // might be invalidated.
212  assert( data_pos_ == data_end_ || buffer_[ data_pos_ ] == '\n' || buffer_[ data_pos_ ] == '\r' );
213  increment_to_next_line_();
214  assert( data_pos_ == data_end_ || column_ == 1 );
215 
216  // Now we are at the beginning of the next line, and can return our result.
217  return result;
218 }
219 
220 // -------------------------------------------------------------------------
221 // fill_line_views_
222 // -------------------------------------------------------------------------
223 
224 void InputStream::fill_line_views_( std::string_view* str_views, size_t n_lines )
225 {
226  // Check edge case.
227  if( data_pos_ >= data_end_ ) {
228  return;
229  }
230 
231  // Read data if necessary. After this, we are guaranteed to have data_pos_ in the first block.
232  // We need to do the update here once, and do not do it again for the rest of the function,
233  // so as to not invalidate the string views.
234  if( data_pos_ >= BlockLength ) {
235  update_blocks_();
236  }
237  assert( data_pos_ < BlockLength );
238 
239  // Store the overall starting position, so that we know when we went too far.
240  size_t const total_start_pos = data_pos_;
241 
242  // We need to stop before the end of the data, and before the end of the second block.
243  // As a safeguard, we are not reading more than one block length away from the current pos.
244  auto const stop_pos = std::min( data_end_, total_start_pos + BlockLength );
245 
246  // Fill the lines with string views.
247  for( size_t i = 0; i < n_lines; ++i ) {
248  // Store the line starting position, so that we can copy from there once we found the end.
249  size_t const start_pos = data_pos_;
250 
251  // Check case that we do not have enough lines in the file any more.
252  if( data_pos_ >= data_end_ ) {
253  throw std::runtime_error(
254  "Reached the end of input before reading " + std::to_string( n_lines ) +
255  " lines from " + source_name()
256  );
257  }
258 
259  // Move data_pos_ to the new line char or to the stop position.
260  move_to_line_or_buffer_end_( stop_pos );
261 
262  // Now we are either at the new line character, or at the end of the current data.
263  assert( data_pos_ >= start_pos );
264  assert( data_pos_ == stop_pos || buffer_[ data_pos_ ] == '\n' || buffer_[ data_pos_ ] == '\r' );
265  assert( stop_pos == data_end_ || stop_pos == total_start_pos + BlockLength );
266 
267  // Check that we are still within bounds. We include here that we need another char for the
268  // new line (or two, if it is a \r\n combination), which we will process next.
269  // If the sum of lines is not yet finished after a full block, we cannot use this function.
270  // This has an edge case where the data edge is also exactly the end of one block length.
271  // But that is so close to the failure condition anyway (just one or two chars off) that we
272  // just treat that as an error as well, for simplicity. Shouldn't matter if we are allowed
273  // to read lines of 4MB or 4MB minus 2B.
274  if( data_pos_ >= total_start_pos + BlockLength - 2 ) {
275  throw std::runtime_error(
276  "Cannot call InputStream::get_line_views() on lines that are in sum longer "
277  "than the internal buffer of " + to_string_byte_format( BlockLength ) + " bytes"
278  );
279  }
280 
281  // Store a view of the range that we found.
282  str_views[i] = std::string_view( buffer_ + start_pos, data_pos_ - start_pos );
283 
284  // If we are here, we have either found our char and are at the end of the line, or we have
285  // reached the end of the input. In the first case, we move to the beginning of the next line.
286  // The function below shall not call update_blocks_(), as otherwise our return value
287  // might be invalidated.
288  assert( data_pos_ == data_end_ || buffer_[ data_pos_ ] == '\n' || buffer_[ data_pos_ ] == '\r' );
289  increment_to_next_line_();
290  assert( data_pos_ == data_end_ || column_ == 1 );
291  }
292 }
293 
294 #endif // ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
295 
296 // -------------------------------------------------------------------------
297 // update_and_move_to_line_or_buffer_end_
298 // -------------------------------------------------------------------------
299 
300 size_t InputStream::update_and_move_to_line_or_buffer_end_()
301 {
302  // The caller needs to guarantee that we are not at the end,
303  // because the caller would need to react to that in their own way.
304  assert( data_pos_ < data_end_ );
305 
306  // Read data if necessary. After this, we are guaranteed to have data_pos_ in the first block.
307  if( data_pos_ >= BlockLength ) GENESIS_UNLIKELY {
308  update_blocks_();
309  }
310  assert( data_pos_ < BlockLength );
311 
312  // Store the starting position, so that we can copy from there once we found the end.
313  size_t const start_pos = data_pos_;
314 
315  // We need to stop before the end of the data, and before the end of the second block.
316  // As a safeguard, we are not reading more than one block length away from the current pos.
317  auto const stop_pos = std::min( data_end_, data_pos_ + BlockLength );
318 
319  // Move data_pos_ to the new line char or to the stop position.
320  move_to_line_or_buffer_end_( stop_pos );
321 
322  // Now we are either at the new line character, or at the end of the current data.
323  // We return how far we moved: A whole block, or where we found the new line.
324  assert( data_pos_ >= start_pos );
325  assert( data_pos_ == stop_pos || buffer_[ data_pos_ ] == '\n' || buffer_[ data_pos_ ] == '\r' );
326  return data_pos_ - start_pos;
327 }
328 
329 // -------------------------------------------------------------------------
330 // move_to_line_or_buffer_end_
331 // -------------------------------------------------------------------------
332 
333 void InputStream::move_to_line_or_buffer_end_( size_t const stop_pos )
334 {
335  // Pick the fastest implementation available to move data_pos_ as close as possible
336  // to the next nl or cr character, without overshooting the end of the data or block.
337  #if defined(GENESIS_AVX512)
338  approach_line_or_buffer_end_avx512_( stop_pos );
339  #elif defined(GENESIS_AVX2)
340  approach_line_or_buffer_end_avx2_( stop_pos );
341  #else
342  approach_line_or_buffer_end_unrolled_( stop_pos );
343  #endif
344 
345  // The above approach loops might end with data_pos_ somewhere before the exact line break.
346  // In those cases, we need to walk the rest by foot, and examine char by char.
347  while(
348  data_pos_ < stop_pos &&
349  buffer_[ data_pos_ ] != '\n' &&
350  buffer_[ data_pos_ ] != '\r'
351  ) {
352  ++data_pos_;
353  }
354 }
355 
356 // -------------------------------------------------------------------------
357 // approach_line_or_buffer_end_avx512_
358 // -------------------------------------------------------------------------
359 
360 #if defined(GENESIS_AVX512)
361 
362 void InputStream::approach_line_or_buffer_end_avx512_( size_t const stop_pos )
363 {
364  // UNTESTED and hence unused - we never set the GENESIS_AVX512 definition
365 
366  static auto const all_nl = _mm512_set1_epi8('\n');
367  static auto const all_cr = _mm512_set1_epi8('\r');
368 
369  // Process 64 bytes at a time using AVX-512
370  while( data_pos_ + 64 <= stop_pos ) {
371  auto data_64bytes = _mm512_loadu_si512(
372  reinterpret_cast<__m512i const*>( buffer_ + data_pos_ )
373  );
374 
375  // Compare each byte in the chunk with '\n' and '\r'
376  auto nl_pos = _mm512_cmpeq_epi8_mask(data_64bytes, all_nl);
377  auto cr_pos = _mm512_cmpeq_epi8_mask(data_64bytes, all_cr);
378 
379  // Combine the results of the comparisons.
380  // If this is non-zero, we have found a nl or cr character.
381  auto nr_pos = nl_pos | cr_pos;
382 
383  // Check if any of the comparisons were true.
384  if( nr_pos != 0 ) {
385  // Find the position of the first set bit, using intrinsics.
386  int offset = _tzcnt_u64(nr_pos);
387  data_pos_ += offset;
388  break;
389  }
390 
391  data_pos_ += 64;
392  }
393 }
394 
395 #else // defined(GENESIS_AVX512)
396 
397 void InputStream::approach_line_or_buffer_end_avx512_( size_t const stop_pos )
398 {
399  // Avoid compiler complaints when below code is not processed.
400  (void) stop_pos;
401  assert( false );
402 }
403 
404 #endif // defined(GENESIS_AVX512)
405 
406 // -------------------------------------------------------------------------
407 // approach_line_or_buffer_end_avx2_
408 // -------------------------------------------------------------------------
409 
410 #if defined(GENESIS_AVX2)
411 
412 void InputStream::approach_line_or_buffer_end_avx2_( size_t const stop_pos )
413 {
414  // 32 byte masks where each byte is new line or carriage return.
415  static auto const all_nl = _mm256_set1_epi8( '\n' );
416  static auto const all_cr = _mm256_set1_epi8( '\r' );
417 
418  // Load chunks of 32 bytes and loop until one of them contains nl or cr,
419  // or we reach the end of what we can currently process.
420  int mask = 0;
421  bool aligned = reinterpret_cast<uintptr_t>( buffer_ + data_pos_ ) % 32 == 0;
422  while( data_pos_ + 32 <= stop_pos ) {
423 
424  // Load 32 bytes of data. We first do an unaligned load for the first iteration,
425  // and then move forward to the next alignment boundary, so that subsequent
426  // iterations can use aligned load. On average this will double check 16 bytes,
427  // which might be slower when the data consists of many very short lines.
428  // But typically, that is not the case, and then this gives significant speedup.
429  __m256i data_chunk;
430  if( aligned ) {
431  assert( reinterpret_cast<uintptr_t>( buffer_ + data_pos_ ) % 32 == 0 );
432  data_chunk = _mm256_load_si256(
433  reinterpret_cast<__m256i const*>( buffer_ + data_pos_ )
434  );
435  } else {
436  data_chunk = _mm256_loadu_si256(
437  reinterpret_cast<__m256i const*>( buffer_ + data_pos_ )
438  );
439  }
440 
441  // Compare the data with the masks, setting bits where they match,
442  // and combining them into one mask that we then evaluate.
443  auto const nl_pos = _mm256_cmpeq_epi8( data_chunk, all_nl );
444  auto const cr_pos = _mm256_cmpeq_epi8( data_chunk, all_cr );
445  auto const nr_pos = _mm256_or_si256( nl_pos, cr_pos );
446 
447  // Get a bit mask that is set wherever nl or cr are.
448  // If there is a bit set, we are done with the loop.
449  mask = _mm256_movemask_epi8( nr_pos );
450  if( mask != 0 ) {
451  break;
452  }
453  if( aligned ) {
454  data_pos_ += 32;
455  } else {
456  auto const remainder = reinterpret_cast<uintptr_t>( buffer_ + data_pos_ ) % 32;
457  data_pos_ += 32 - remainder;
458  aligned = true;
459  }
460  }
461 
462  // If we have builtin capabilities to find the first set bit, we use it.
463  // This brings data_pos_ to where the nl or cr is, so that the slow loop at the end of
464  // move_to_line_or_buffer_end_() will not run. If we do not have the builtin, we instead use
465  // the loop in move_to_line_or_buffer_end_() to find the exact position of the new line char.
466  #if defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
467 
468  // If we found a new line, use the mask to get position of the first set bit.
469  // This is where the nl or cr character is located, so we move there.
470  if( mask != 0 ) {
471  auto const offset = __builtin_ctz(mask);
472  data_pos_ += offset;
473  assert( data_pos_ <= stop_pos );
474  assert( buffer_[ data_pos_ ] == '\n' || buffer_[ data_pos_ ] == '\r' );
475  } else {
476  assert( data_pos_ + 32 > stop_pos );
477  }
478 
479  #else
480 
481  // Without the builtin, we at least do a bit of loop unrolling to get closer
482  // to where we want to be - the new line or the end of the data.
483  approach_line_or_buffer_end_unrolled_( stop_pos );
484 
485  #endif // defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
486 }
487 
488 #else // defined(GENESIS_AVX2)
489 
490 void InputStream::approach_line_or_buffer_end_avx2_( size_t const stop_pos )
491 {
492  // Avoid compiler complaints when below code is not processed.
493  (void) stop_pos;
494  assert( false );
495 }
496 
497 #endif // defined(GENESIS_AVX2)
498 
499 // -------------------------------------------------------------------------
500 // approach_line_or_buffer_end_unrolled_
501 // -------------------------------------------------------------------------
502 
503 void InputStream::approach_line_or_buffer_end_unrolled_( size_t const stop_pos )
504 {
505  // 8-fold loop unrolling, to help the compiler.
506  // It gives some speedup, in particular if the reading is used in a parser that also
507  // does other things with the data. In a stand-alone line reader, it still gives
508  // a slight advantage.
509  while(
510  data_pos_ + 7 < stop_pos &&
511  buffer_[ data_pos_ + 0 ] != '\n' &&
512  buffer_[ data_pos_ + 0 ] != '\r' &&
513  buffer_[ data_pos_ + 1 ] != '\n' &&
514  buffer_[ data_pos_ + 1 ] != '\r' &&
515  buffer_[ data_pos_ + 2 ] != '\n' &&
516  buffer_[ data_pos_ + 2 ] != '\r' &&
517  buffer_[ data_pos_ + 3 ] != '\n' &&
518  buffer_[ data_pos_ + 3 ] != '\r' &&
519  buffer_[ data_pos_ + 4 ] != '\n' &&
520  buffer_[ data_pos_ + 4 ] != '\r' &&
521  buffer_[ data_pos_ + 5 ] != '\n' &&
522  buffer_[ data_pos_ + 5 ] != '\r' &&
523  buffer_[ data_pos_ + 6 ] != '\n' &&
524  buffer_[ data_pos_ + 6 ] != '\r' &&
525  buffer_[ data_pos_ + 7 ] != '\n' &&
526  buffer_[ data_pos_ + 7 ] != '\r'
527  ) {
528  data_pos_ += 8;
529  }
530 
531  // Alternative version that uses 64bit words instead, and hence works without AVX.
532  // Uses macros from https://graphics.stanford.edu/~seander/bithacks.html
533 
534  // static auto const nmask = ~static_cast<uint64_t>(0) / 255U * '\n';
535  // static auto const rmask = ~static_cast<uint64_t>(0) / 255U * '\r';
536 
537  // #define haszero(v) (((v) - static_cast<uint64_t>(0x0101010101010101)) & ~(v) & static_cast<uint64_t>(0x8080808080808080))
538  // #define hasvalue(x,n) (haszero((x) ^ (~static_cast<uint64_t>(0) / 255U * (n))))
539  //
540  // auto const* buffc = reinterpret_cast<uint64_t const*>( buffer_ + data_pos_ );
541  // size_t i = 0;
542  // while( true ) {
543  // bool const e = i*8 >= data_end_;
544  // bool const b = i*8 - start >= BlockLength;
545  //
546  // // bool const n = buffc[i] ^ nmask;
547  // // bool const r = buffc[i] ^ rmask;
548  // bool const n = hasvalue( buffc[i], '\n' );
549  // bool const r = hasvalue( buffc[i], '\r' );
550  //
551  // if( e | b | n | r ) {
552  // break;
553  // }
554  //
555  // ++i;
556  // }
557  // data_pos_ += i*8;
558  //
559  // #undef haszero
560  // #undef hasvalue
561 }
562 
563 // -------------------------------------------------------------------------
564 // increment_to_next_line_
565 // -------------------------------------------------------------------------
566 
567 void InputStream::increment_to_next_line_()
568 {
569  // Some safty.
570  assert( data_pos_ <= data_end_ );
571  assert( data_pos_ < 2 * BlockLength );
572 
573  // Check all cases that can occur.
574  if( data_pos_ == data_end_ ) {
575 
576  // Files might be missing the line break at the end of the last line.
577  // We catch this case here, so that we can be sure that the next conditions
578  // are actually valid when accessing the buffer.
579  // But we don't need to do anything in this case.
580 
581  } else if( buffer_[ data_pos_ ] == '\n' ) {
582  ++data_pos_;
583 
584  } else if( buffer_[ data_pos_ ] == '\r' ) {
585  ++data_pos_;
586 
587  // Treat stupid Windows \r\n lines breaks.
588  // We already moved past the \r, so check the next char.
589  if( data_pos_ < data_end_ && buffer_[ data_pos_ ] == '\n' ) {
590  ++data_pos_;
591  }
592  } else {
593  // We need to have checked all cases where this function is called from already.
594  // So this should not happen.
595  assert( false );
596  }
597 
598  // Set char and counters. It checks for end of the file,
599  // so this is safe if we are past the end already.
600  set_current_char_();
601  ++line_;
602  column_ = 1;
603 }
604 
605 // =================================================================================================
606 // Buffer Access
607 // =================================================================================================
608 
610 {
611  // Safety first! We do a single check here, so that in the default case,
612  // we only branch once - assuming that the compiler doesn't optimize that even better anway.
613  if( data_pos_ + n >= data_end_ ) {
614  if( data_pos_ + n == data_end_ ) {
615  // Lazy approach to make sure that all functions are called as expected
616  // when reaching the end of the input data.
617  assert( data_pos_ < data_end_ );
618  assert( n > 0 );
619  data_pos_ += n - 1;
620  column_ += n - 1;
621  advance();
622  return;
623  }
624 
625  // We try to jump past the end
626  assert( data_pos_ + n > data_end_ );
627  throw std::runtime_error(
628  "Invalid InputStream jump to position after buffer end."
629  );
630  }
631 
632  // Update the position as neeeded.
633  data_pos_ += n;
634  column_ += n;
635  if( data_pos_ >= BlockLength ) {
636  update_blocks_();
637  }
638  set_current_char_();
639 }
640 
641 // =================================================================================================
642 // Internal Members
643 // =================================================================================================
644 
645 // -------------------------------------------------------------------------
646 // init_
647 // -------------------------------------------------------------------------
648 
649 void InputStream::init_( std::shared_ptr<BaseInputSource> input_source )
650 {
651  // Set to empty defaults if there is no input.
652  if( input_source == nullptr ) {
653  source_name_ = "invalid source";
654 
655  buffer_ = nullptr;
656  data_pos_ = 0;
657  data_end_ = 0;
658 
659  current_ = '\0';
660  line_ = 0;
661  column_ = 0;
662  return;
663  }
664 
665  // We use three buffer blocks: The first two for the current block/line.
666  // The max line length is one buffer length, so the beginning of the line is always
667  // in the first block, while its end can reach into the second block, but never exeed it.
668  // The third block is for the async reading.
669  buffer_ = new char[ 3 * BlockLength ];
670 
671  try {
672  // Set source name.
673  source_name_ = input_source->source_name();
674 
675  // Read up to two blocks.
676  data_pos_ = 0;
677  data_end_ = input_source->read( buffer_, 2 * BlockLength );
678 
679  // Skip UTF-8 BOM, if found.
680  if( data_end_ >= 3 &&
681  buffer_[0] == '\xEF' &&
682  buffer_[1] == '\xBB' &&
683  buffer_[2] == '\xBF'
684  ) {
685  data_pos_ = 3;
686  }
687 
688  // If there was no data, set to "empty" values.
689  if( data_pos_ == data_end_ ) {
690  reset_();
691 
692  // If there is data, set char value.
693  } else {
694  set_current_char_();
695  }
696 
697  // If there is more data after the two blocks that we just read, start the
698  // reading process (possibly async).
699  if( data_end_ == 2 * BlockLength ) {
700 
701  // Create the reader. We need to do this explictily,
702  // as we use a unique ptr to make this class movable.
703  input_reader_ = utils::make_unique<InputReader>( input_source );
704  input_reader_->start_reading( buffer_ + 2 * BlockLength, BlockLength );
705  }
706 
707  } catch( ... ) {
708  delete[] buffer_;
709  throw;
710  }
711 }
712 
713 // -------------------------------------------------------------------------
714 // update_blocks_
715 // -------------------------------------------------------------------------
716 
717 void InputStream::update_blocks_()
718 {
719  // This function is only called locally in contexts where we already know that we need to
720  // update the blocks. We only assert this here again, meaning that we expect the caller
721  // functions to check for this already. Handling it this way ensures that the function
722  // jump is only made when necesary.
723  assert( data_pos_ >= BlockLength );
724 
725  // Furthermore, the callers need to check the following condition. So, if it breaks, this
726  // function is invalidly called from somewhere else.
727  assert( data_pos_ < data_end_ );
728 
729  // If this assertion breaks, someone tempered with our internal invariants.
730  assert( data_end_ <= BlockLength * 2 );
731 
732  // Move the second to the first block.
733  std::memcpy( buffer_, buffer_ + BlockLength, BlockLength );
734  data_pos_ -= BlockLength;
735  data_end_ -= BlockLength;
736 
737  // If we are not yet at the end of the data, start the reader again:
738  // Copy the third block to the second, and read into the third one.
739  if( input_reader_ && input_reader_->valid() ) {
740  data_end_ += input_reader_->finish_reading();
741  std::memcpy( buffer_ + BlockLength, buffer_ + 2 * BlockLength, BlockLength );
742  input_reader_->start_reading( buffer_ + 2 * BlockLength, BlockLength );
743  }
744 
745  // After the update, the current position needs to be within the first block.
746  assert( data_pos_ < BlockLength );
747 }
748 
749 // -------------------------------------------------------------------------
750 // set_current_char_
751 // -------------------------------------------------------------------------
752 
753 void InputStream::set_current_char_()
754 {
755  // Check end of stream conditions.
756  if( data_pos_ >= data_end_ ) GENESIS_UNLIKELY {
757  // We do not expect to overshoot. Let's assert this, but if it still happens
758  // (in release build), we can also cope, and will just set \0 as the current char.
759  assert( data_pos_ == data_end_ );
760 
761  if( data_pos_ == data_end_ && data_pos_ > 0 && buffer_[ data_pos_ - 1 ] != '\n' ) {
762  // If this is the end of the data, but there was no closing \n, add one.
763  buffer_[ data_pos_ ] = '\n';
764  ++data_end_;
765  } else {
766  // If we reached the end, do not fully reset the line and column counters.
767  // They might be needed in some parser.
768  current_ = '\0';
769  return;
770  }
771  }
772 
773  // Treat stupid Windows and Mac lines breaks. Set them to \n, so that downstream parsers
774  // don't have to deal with this.
775  if( buffer_[ data_pos_ ] == '\r' ) {
776  buffer_[ data_pos_ ] = '\n';
777 
778  // If this is a Win line break \r\n, skip one of them, so that only a single \n
779  // is visible to the outside. We do not treat \n\r line breaks properly here!
780  // If any system still uses those, we'd have to change code here.
781  if( data_pos_ + 1 < data_end_ && buffer_[ data_pos_ + 1 ] == '\n' ) {
782  ++data_pos_;
783  }
784  }
785 
786  // Set the char.
787  current_ = buffer_[ data_pos_ ];
788 }
789 
790 } // namespace utils
791 } // namespace genesis
genesis::utils::InputStream::at
std::string at() const
Return a textual representation of the current input position in the form "line:column".
Definition: input_stream.hpp:437
genesis::utils::InputStream::read_char_or_throw
char read_char_or_throw(char const criterion)
Lexing function that reads a single char from the stream and checks whether it equals the provided on...
Definition: input_stream.cpp:89
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
genesis::utils::InputStream::source_name
std::string source_name() const
Get the input source name where this stream reads from.
Definition: input_stream.hpp:478
genesis::utils::offset
void offset(Histogram &h, double value)
Definition: operations.cpp:47
genesis::utils::InputStream::operator++
self_type & operator++()
Move to the next char in the stream. Shortcut for advance().
Definition: input_stream.hpp:196
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
string.hpp
Provides some commonly used string utility functions.
input_stream.hpp
genesis::utils::InputStream::operator=
self_type & operator=(self_type const &)=delete
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::to_string_byte_format
std::string to_string_byte_format(size_t value)
Produce a human readable formatting of a size in bytes, using the appropriate suffix.
Definition: string.cpp:1047
genesis::utils::InputStream::get_line
std::string get_line()
Read the current line and move to the beginning of the next.
Definition: input_stream.hpp:284
genesis::utils::InputStream::BlockLength
static const size_t BlockLength
Block length for internal buffering.
Definition: input_stream.hpp:101
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
GENESIS_UNLIKELY
#define GENESIS_UNLIKELY
Definition: std.hpp:67
genesis::utils::InputStream::advance
self_type & advance()
Move to the next char in the stream and advance the counters.
Definition: input_stream.hpp:187
genesis::utils::InputStream::good
bool good() const
Return true iff the input is good (not end of data) and can be read from.
Definition: input_stream.hpp:445
genesis::utils::InputStream::jump_unchecked
void jump_unchecked(size_t n)
Jump forward in the stream by a certain amount of chars.
Definition: input_stream.cpp:609