A library for working with phylogenetic and population genetic data.
v0.32.0
sync_reader.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
40 
41 #include <cassert>
42 #include <cstdint>
43 #include <stdexcept>
44 
45 namespace genesis {
46 namespace population {
47 
48 // =================================================================================================
49 // Reading & Parsing
50 // =================================================================================================
51 
52 // -------------------------------------------------------------------------
53 // read_header
54 // -------------------------------------------------------------------------
55 
56 std::vector<std::string> SyncReader::read_header(
57  utils::InputStream& input_stream
58 ) const {
59  // Check that there is a header; if not, we just leave again, without any sample names.
60  std::vector<std::string> result;
61  auto& it = input_stream;
62  if( !it || *it != '#' ) {
63  return result;
64  }
65 
66  // Move to the content, potentially skipping a tab there.
67  ++it;
68  if( it && *it == '\t' ) {
69  ++it;
70  }
71 
72  // Now we can read the rest of the line, and for simplicity just split later.
73  // Check that the fixed columns are there as expected.
74  result = utils::split( it.get_line() );
75  if( result.size() < 3 || result[0] != "chr" || result[1] != "pos" || result[2] != "ref" ) {
76  throw std::runtime_error(
77  "Malformed sync " + it.source_name() + ": Header row provided (starting with '#'), " +
78  "but the first three entries are not \"chr\", \"pos\", \"ref\"."
79  );
80  }
81  result.erase( result.begin(), result.begin() + 3 );
82  return result;
83 }
84 
85 std::vector<std::string> SyncReader::read_header(
86  utils::InputStream& input_stream,
87  std::vector<bool> const& sample_filter
88 ) const {
89  // The header only has to be read once, so we do not need to be overly efficient here.
90  // Simply call the unfiltered function, and then subset later;
91  // no need to replicate the parsing code here.
92  std::vector<std::string> result;
93  auto all_sample_names = read_header( input_stream );
94  if( all_sample_names.empty() ) {
95  return result;
96  }
97 
98  // Now subset to the entries that we actually want.
99  if( sample_filter.size() != all_sample_names.size() ) {
100  throw std::runtime_error(
101  "Malformed sync " + input_stream.source_name() +
102  ": Number of sample names in header (" +
103  std::to_string( all_sample_names.size() ) +
104  ") does not match number of samples in filter (" +
105  std::to_string( sample_filter.size() ) + ")"
106  );
107  }
108  for( size_t i = 0; i < sample_filter.size(); ++i ) {
109  if( sample_filter[i] ) {
110  result.push_back( all_sample_names[i] );
111  }
112  }
113  return result;
114 }
115 
116 // -------------------------------------------------------------------------
117 // read
118 // -------------------------------------------------------------------------
119 
120 std::vector<Variant> SyncReader::read(
121  std::shared_ptr< utils::BaseInputSource > source
122 ) const {
123  std::vector<Variant> result;
124  utils::InputStream it( source );
125 
126  // Potentially read the header (and discard it, as our Variant does not store sample names).
127  read_header( it );
128 
129  // Read until end of input, pushing copies into the result
130  // (moving would not reduce the number of times that we need to allocate memory here).
131  Variant variant;
132  while( parse_line_( it, variant, {}, false )) {
133  result.push_back( variant );
134  }
135  return result;
136 }
137 
138 std::vector<Variant> SyncReader::read(
139  std::shared_ptr< utils::BaseInputSource > source,
140  std::vector<bool> const& sample_filter
141 ) const {
142  std::vector<Variant> result;
143  utils::InputStream it( source );
144 
145  // Potentially read the header (and discard it, as our Variant does not store sample names).
146  read_header( it, sample_filter );
147 
148  // Read until end of input
149  Variant variant;
150  while( parse_line_( it, variant, sample_filter, true )) {
151  result.push_back( variant );
152  }
153  return result;
154 }
155 
156 // -------------------------------------------------------------------------
157 // parse_line
158 // -------------------------------------------------------------------------
159 
161  utils::InputStream& input_stream,
162  Variant& variant
163 ) const {
164  return parse_line_( input_stream, variant, {}, false );
165 }
166 
168  utils::InputStream& input_stream,
169  Variant& variant,
170  std::vector<bool> const& sample_filter
171 ) const {
172  return parse_line_( input_stream, variant, sample_filter, true );
173 }
174 
175 // =================================================================================================
176 // Internal Parsing
177 // =================================================================================================
178 
179 // -------------------------------------------------------------------------
180 // parse_line_
181 // -------------------------------------------------------------------------
182 
183 bool SyncReader::parse_line_(
184  utils::InputStream& input_stream,
185  Variant& variant,
186  std::vector<bool> const& sample_filter,
187  bool use_sample_filter
188 ) const {
189  using namespace genesis::utils;
190 
191  // Shorthand.
192  auto& it = input_stream;
193  if( !it ) {
194  variant = Variant{};
195  return false;
196  }
197 
198  // Read fixed columns for chromosome and position.
199  variant.chromosome = utils::read_until( it, []( char c ){ return c == '\t' || c == '\n'; });
200  if( variant.chromosome.empty() ) {
201  throw std::runtime_error(
202  "Malformed sync " + it.source_name() + " at " + it.at() +
203  ": empty chromosome name"
204  );
205  }
206  it.read_char_or_throw( '\t' );
207  variant.position = parse_unsigned_integer<size_t>( it );
208  if( variant.position == 0 ) {
209  throw std::runtime_error(
210  "Malformed sync " + it.source_name() + " at " + it.at() +
211  ": chromosome position == 0"
212  );
213  }
214  it.read_char_or_throw( '\t' );
215  if( !it || *it == '\n' ) {
216  throw std::runtime_error(
217  std::string("In ") + it.source_name() + ": Unexpected end of line at " + it.at()
218  );
219  }
220 
221  // Read and check fixed column for the reference base.
222  auto const rb = to_upper( *it );
223  if( ! is_valid_base_or_n( rb ) && rb != '.' && rb != '*' ) {
224  throw std::runtime_error(
225  std::string("In ") + it.source_name() + ": Invalid reference base char " +
226  char_to_hex(rb) + " at " + it.at()
227  );
228  }
229  variant.reference_base = rb;
230  ++it;
231 
232  // Read the samples. We switch once for the first line, and thereafter check that we read the
233  // same number of samples each time.
234  size_t src_index = 0;
235  if( variant.samples.empty() ) {
236  while( it && *it != '\n' ) {
237  if( !use_sample_filter || ( src_index < sample_filter.size() && sample_filter[src_index] )) {
238  variant.samples.emplace_back();
239  parse_sample_( it, variant.samples.back() );
240  } else {
241  skip_sample_( it );
242  }
243  ++src_index;
244  }
245  } else {
246  // Here we need two indices, one over the samples in the file (source),
247  // and one for the samples that we are writing in our Variant (destination).
248  size_t dst_index = 0;
249  while( it && *it != '\n' ) {
250  // Parse or skip, depending on filter.
251  if(
252  ( ! use_sample_filter ) ||
253  ( src_index < sample_filter.size() && sample_filter[src_index] )
254  ) {
255  // If the numbers do not match, go straight to the error check and throw.
256  if( dst_index >= variant.samples.size() ) {
257  break;
258  }
259  assert( dst_index < variant.samples.size() );
260  parse_sample_( it, variant.samples[dst_index] );
261  ++dst_index;
262  } else {
263  skip_sample_( it );
264  }
265  ++src_index;
266  }
267 
268  // Need to have the exact size of samples in the line.
269  if( dst_index != variant.samples.size() ) {
270  throw std::runtime_error(
271  "Malformed sync " + it.source_name() + " at " + it.at() +
272  ": Line with different number of samples. Expecting " +
273  std::to_string( variant.samples.size() ) + " samples based on previous lines, "
274  "but found " + std::to_string( dst_index ) + " (non-filtered) samples."
275  );
276  }
277  }
278  if( use_sample_filter && src_index != sample_filter.size() ) {
279  throw std::runtime_error(
280  "Malformed sync " + it.source_name() + " at " + it.at() +
281  ": Number of samples in the line (" + std::to_string( src_index ) +
282  ") does not match the number of filter entries (" +
283  std::to_string( sample_filter.size() ) + ")."
284  );
285  }
286 
287  // Sync does not have alt bases, so try to get one based on counts.
288  // Excluding the ref base, we use the base of the remaining three that has the highest total
289  // count across all samples, unless all of them are zero, in which case we do not set the
290  // alt base. We also skip cases where the ref is not in ACGT, as then alt is also meaningless.
291  if( guess_alt_base_ ) {
292  variant.alternative_base = guess_alternative_base( variant, true );
293  } else {
294  variant.alternative_base = 'N';
295  }
296 
297  // Set the status of the Variant. If all samples are missing, so is this Variant.
298  variant.status.reset();
299  size_t missing_count = 0;
300  for( auto const& sample : variant.samples ) {
301  if( sample.status.is( SampleCountsFilterTag::kMissing )) {
302  ++missing_count;
303  }
304  }
305  if( missing_count == variant.samples.size() ) {
307  }
308 
309  // We reched the end of the line or of the whole input.
310  // Move to the beginning of the next line.
311  assert( !it || *it == '\n' );
312  ++it;
313  return true;
314 }
315 
316 // -------------------------------------------------------------------------
317 // parse_sample_gcc_intrinsic_
318 // -------------------------------------------------------------------------
319 
320 // Only use intrinsics version for the compilers that support them!
321 #if defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
322 
323 void SyncReader::parse_sample_gcc_intrinsic_(
324  utils::InputStream& input_stream,
325  SampleCounts& sample
326 ) const {
327  using namespace genesis::utils;
328  auto& it = input_stream;
329  auto const buff = it.buffer();
330 
331  // We can only run this function if the buffer is guaranteed to contain at least 6 integers
332  // of the largest size that we can process here (8 bytes in bulk, with 7 of them for the digits,
333  // and one for the delimiter). If the buffer is smaller, because we are near
334  // the end of the file, we switch to the slow function instead.
335  // This check is conservative, as in most cases, we won't have numbers that long in the data.
336  // But for those last few entries in a large file, this does not really matter, so let's play
337  // it safe!
338  if( buff.second < 6 * 8 ) {
339  parse_sample_simple_( it, sample );
340  return;
341  }
342 
343  // This function adapts a lot of the ideas from our
344  // InputStream::parse_unsigned_integer_intrinsic_() function. See there for details on the
345  // techniques being used here. We here only provide shortened comments on the bit tricks.
346 
347  // We define a chunk to represent one count number, ACGT and N and D (deletions),
348  // which starts with a chunk that contains 8 bytes from the input stream, which will then be
349  // shortened and processed as needed to only contain the actual digits, and is then finally
350  // turned into the integer representation, step by step.
351  struct Chunk
352  {
353  // The data to process
354  uint64_t data = 0;
355 
356  // How many bytes are actually digits? This is stored as the number of digits plus one.
357  size_t length = 0;
358 
359  // Where in the buffer does this chunk (this sequence of digits) start?
360  size_t offset = 0;
361  };
362 
363  // Function to get a chunk, that is one set of chars representing a number. We here get 8 byte
364  // in bulk, and later check that those contain the delimiter to the next number or the end of
365  // the sample.
366  // If not, we only find out at the end, after having done all the parsing work, and will thus
367  // have wasted timed, but this only ever occurrs in cases with counts >= 10,000,000 (more than
368  // 7 digits, so that there is no delimiter within the 8 bytes), which should be rare in practice,
369  // and in which case we can live with the waste.
370  auto get_chunk_ = []( char const* buffer, size_t offset )
371  {
372  // Prepare a new chunk and store its offset.
373  Chunk chunk;
374  chunk.offset = offset;
375 
376  // Copy 8 bytes into the chunk that we process as one unit.
377  std::memcpy( &chunk.data, buffer + offset, sizeof( chunk.data ));
378 
379  // Helper macro functions to check whether a word has bytes that are less than or greater
380  // than some specified value, and mark these bytes.
381  // http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
382  // http://graphics.stanford.edu/~seander/bithacks.html#HasMoreInWord
383  auto const zero = static_cast<uint64_t>(0);
384  #define genesis_sync_reader_hasless(x,n) (((x)-~zero/255*(n))&~(x)&~zero/255*128)
385  #define genesis_sync_reader_hasmore(x,n) ((((x)+~zero/255*(127-(n)))|(x))&~zero/255*128)
386 
387  // Get all positions that are not digits, by marking a bit in their respective byte.
388  auto const l = genesis_sync_reader_hasless( chunk.data, '0' );
389  auto const m = genesis_sync_reader_hasmore( chunk.data, '9' );
390  auto const p = l | m;
391 
392  #undef genesis_sync_reader_hasless
393  #undef genesis_sync_reader_hasmore
394 
395  // Find the index of the first byte that is not a digit.
396  // The length is stored plus one here, due to how __builtin_ffs works. We need this later
397  // to check for the edge case (no delimiter found - the word contains only digits).
398  if( sizeof(int) == sizeof(std::uint64_t) ) {
399  chunk.length = __builtin_ffs(p) / 8;
400  } else if( sizeof(long) == sizeof(std::uint64_t) ) {
401  chunk.length = __builtin_ffsl(p) / 8;
402  } else if( sizeof(long long) == sizeof(std::uint64_t) ) {
403  chunk.length = __builtin_ffsll(p) / 8;
404  } else {
405  static_assert(
406  ( sizeof(int) == sizeof(std::uint64_t) ) ||
407  ( sizeof(long) == sizeof(std::uint64_t) ) ||
408  ( sizeof(long long) == sizeof(std::uint64_t) ),
409  "No compilter intrinsic __builtin_ffs[l][l] for std::uint64_t"
410  );
411  throw std::runtime_error(
412  "No compilter intrinsic __builtin_ffs[l][l] for std::uint64_t"
413  );
414  }
415 
416  return chunk;
417  };
418 
419  // Do the minimal amount of work that is necessary to get all chunks into position.
420  // That is, in this part, one line depends on the output of the previous, as we have to move
421  // forward in the buffer depending on how many digits we found.
422  // By doing the minimal work here (that is, not yet unpacking the chunk data into actual
423  // integers), we can maximize the CPU pipeline parallel part later (that does the unpacking).
424  // We start with offset 1, to skip the inital tab. We check later that there actually is a tab.
425  // The allele frequencies are stored in the order `A:T:C:G:N:del`,
426  // see https://sourceforge.net/p/popoolation2/wiki/Tutorial/
427  auto a_chunk = get_chunk_( buff.first, 1 );
428  auto t_chunk = get_chunk_( buff.first, a_chunk.offset + a_chunk.length );
429  auto c_chunk = get_chunk_( buff.first, t_chunk.offset + t_chunk.length );
430  auto g_chunk = get_chunk_( buff.first, c_chunk.offset + c_chunk.length );
431  auto n_chunk = get_chunk_( buff.first, g_chunk.offset + g_chunk.length );
432  auto d_chunk = get_chunk_( buff.first, n_chunk.offset + n_chunk.length );
433 
434  // This has to follow from the logic of the above.
435  assert( a_chunk.offset == 1 );
436  assert( t_chunk.offset == a_chunk.offset + a_chunk.length );
437  assert( c_chunk.offset == t_chunk.offset + t_chunk.length );
438  assert( g_chunk.offset == c_chunk.offset + c_chunk.length );
439  assert( n_chunk.offset == g_chunk.offset + g_chunk.length );
440  assert( d_chunk.offset == n_chunk.offset + n_chunk.length );
441 
442  // Function to process a chunk, that is, one number that is meant to be a count in the file.
443  // See InputStream::parse_unsigned_integer_intrinsic_() for details.
444  auto process_chunk_ = []( Chunk& chunk )
445  {
446  // We need to move the actual data chars that we want to parse to the left-most
447  // position for the following code to work.
448  chunk.data <<= (8 * ( 8 - chunk.length + 1 ));
449 
450  // 1-byte mask trick (works on 4 pairs of single digits)
451  std::uint64_t lower_digits = (chunk.data & 0x0f000f000f000f00) >> 8;
452  std::uint64_t upper_digits = (chunk.data & 0x000f000f000f000f) * 10;
453  chunk.data = lower_digits + upper_digits;
454 
455  // 2-byte mask trick (works on 2 pairs of two digits)
456  lower_digits = (chunk.data & 0x00ff000000ff0000) >> 16;
457  upper_digits = (chunk.data & 0x000000ff000000ff) * 100;
458  chunk.data = lower_digits + upper_digits;
459 
460  // 4-byte mask trick (works on pair of four digits)
461  lower_digits = (chunk.data & 0x0000ffff00000000) >> 32;
462  upper_digits = (chunk.data & 0x000000000000ffff) * 10000;
463  chunk.data = lower_digits + upper_digits;
464 
465  // Check that we got at least one digit, and at most 7.
466  // When there was no digit at all, the first char is a non-digit, meaning that length == 1.
467  // When there were only digits (all 8 bytes), __builtin_ffs returned length == 0 instead,
468  // as then no delimiter was found at all. Check both cases at once, by length > 1.
469  // Also, we assert that the intrinsic did not return anything too large.
470  assert( chunk.length <= 8 );
471  return ( chunk.length > 1 );
472  };
473 
474  // Now do the bulk processing, using CPU-level pipeline parallelization by offering all
475  // chunks to the CPU at once, with no dependencies between them. Any reasonable compiler will
476  // make use of this fact. This gives ~25% speedup in our tests compared to the already fast
477  // InputStream::parse_unsigned_integer_intrinsic_() function.
478  bool good = true;
479  good &= process_chunk_( a_chunk );
480  good &= process_chunk_( t_chunk );
481  good &= process_chunk_( c_chunk );
482  good &= process_chunk_( g_chunk );
483  good &= process_chunk_( n_chunk );
484  good &= process_chunk_( d_chunk );
485 
486  // We have now processed all chunk data, which now contain the actual numbers.
487  sample.a_count = a_chunk.data;
488  sample.t_count = t_chunk.data;
489  sample.c_count = c_chunk.data;
490  sample.g_count = g_chunk.data;
491  sample.n_count = n_chunk.data;
492  sample.d_count = d_chunk.data;
493 
494  // At the end do the error check, so that we are not wasting cycles to wait for the result
495  // of this check in the standard (non-error) case first. If this fails, no problem, we have
496  // not yet moved in the buffer, so just run the slow version on the same data again,
497  // to get proper parsing (for cases with more than 7 digits) or proper error reporting.
498  // We here check that the sample was delimited by a tab, that all number conversions were good
499  // (that is, they contained at least one digit, and at most 7), and were all delimited by colons.
500  // We also already asserted above that all offsets are at least 1, so that the subtraction
501  // of 1 here works without wrapping around the unsigned int.
502  if(
503  ( *it != '\t' ) ||
504  ! good ||
505  buff.first[ t_chunk.offset - 1 ] != ':' ||
506  buff.first[ c_chunk.offset - 1 ] != ':' ||
507  buff.first[ g_chunk.offset - 1 ] != ':' ||
508  buff.first[ n_chunk.offset - 1 ] != ':' ||
509  buff.first[ d_chunk.offset - 1 ] != ':'
510  ) {
511  // Repeat slowly to throw error at the correct position.
512  parse_sample_simple_( it, sample );
513  return;
514  }
515 
516  // If we are here, we have read a full sample with no error. This means that there were at least
517  // 6 digits, 5 colons, and the inital tab, so 12 chars in total that we jump.
518  assert( d_chunk.offset + d_chunk.length - 1 >= 12 );
519 
520  // Also, just because we can, assert all offsets...
521  assert( a_chunk.offset == 1 );
522  assert( t_chunk.offset >= 3 );
523  assert( c_chunk.offset >= 5 );
524  assert( g_chunk.offset >= 7 );
525  assert( n_chunk.offset >= 9 );
526  assert( d_chunk.offset >= 11 );
527 
528  // ...and lengths. Again, lengths are plus one, due to how __builtin_ffs works.
529  assert( a_chunk.length >= 2 );
530  assert( t_chunk.length >= 2 );
531  assert( c_chunk.length >= 2 );
532  assert( g_chunk.length >= 2 );
533  assert( n_chunk.length >= 2 );
534  assert( d_chunk.length >= 2 );
535 
536  // We can only process data with 7 or fewer digits. Let's assert this.
537  // Potential ways this could fail are, e.g., if somehome we produced random data or max int by
538  // subtracting one from zero, or accessed uninitialized memory, or some other horrible error.
539  assert( sample.a_count < 10000000 );
540  assert( sample.t_count < 10000000 );
541  assert( sample.c_count < 10000000 );
542  assert( sample.g_count < 10000000 );
543  assert( sample.n_count < 10000000 );
544  assert( sample.d_count < 10000000 );
545 
546  // Jump to the position after the last entry.
547  it.jump_unchecked( d_chunk.offset + d_chunk.length - 1 );
548 }
549 
550 #endif // defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
551 
552 // -------------------------------------------------------------------------
553 // parse_sample_simple_
554 // -------------------------------------------------------------------------
555 
556 void SyncReader::parse_sample_simple_(
557  utils::InputStream& input_stream,
558  SampleCounts& sample
559 ) const {
560  using namespace genesis::utils;
561  auto& it = input_stream;
562  it.read_char_or_throw( '\t' );
563 
564  // The allele frequencies are stored in the order `A:T:C:G:N:del`,
565  // see https://sourceforge.net/p/popoolation2/wiki/Tutorial/
566  sample.a_count = parse_unsigned_integer<size_t>( it );
567  it.read_char_or_throw( ':' );
568  sample.t_count = parse_unsigned_integer<size_t>( it );
569  it.read_char_or_throw( ':' );
570  sample.c_count = parse_unsigned_integer<size_t>( it );
571  it.read_char_or_throw( ':' );
572  sample.g_count = parse_unsigned_integer<size_t>( it );
573  it.read_char_or_throw( ':' );
574  sample.n_count = parse_unsigned_integer<size_t>( it );
575  it.read_char_or_throw( ':' );
576  sample.d_count = parse_unsigned_integer<size_t>( it );
577 }
578 
579 // -------------------------------------------------------------------------
580 // parse_sample_
581 // -------------------------------------------------------------------------
582 
583 void SyncReader::parse_sample_(
584  utils::InputStream& input_stream,
585  SampleCounts& sample
586 ) const {
587  using namespace genesis::utils;
588  auto& it = input_stream;
589  auto const buff = it.buffer();
590 
591  // Reset the filter status of the sample, in case it was set to not passing previously.
592  sample.status.reset();
593 
594  // We have two special cases that we want to check: all single digits (in which case we can
595  // speed up the parsing by a lot!), and the missing data annotation format of Kapun.
596  // Both consist of the ?:?:?:?:?:? pattern with single characters in between the colons.
597  // We here check that part first, and then continue checking for the two special cases,
598  // in order to avoid having to double check the shared aspects of the pattern.
599  if(
600  buff.second >= 12 &&
601  buff.first[ 0 ] == '\t' &&
602  buff.first[ 2 ] == ':' &&
603  buff.first[ 4 ] == ':' &&
604  buff.first[ 6 ] == ':' &&
605  buff.first[ 8 ] == ':' &&
606  buff.first[ 10 ] == ':'
607  ) {
608  // We find that almost all entries in real world data are single digits.
609  // Then, an entry has 11 chars: "0:0:6:0:0:0". Use this fact for super-charging the parsing.
610  // We check that all chars are exactly as we expect them. At the end, we only need to check that
611  // at position 12 there is no digit, that is, that the number is done and does not have any more
612  // digits. The check whether that char is valid in the context of the file is then done later
613  // in the next parsing step after finishing this function.
614  if(
615  buff.second >= 13 &&
616  is_digit( buff.first[ 1 ] ) &&
617  is_digit( buff.first[ 3 ] ) &&
618  is_digit( buff.first[ 5 ] ) &&
619  is_digit( buff.first[ 7 ] ) &&
620  is_digit( buff.first[ 9 ] ) &&
621  is_digit( buff.first[ 11 ] ) &&
622  ! is_digit( buff.first[ 12 ] )
623  ) {
624  // Convert single digits from ASCII to their int value.
625  sample.a_count = buff.first[ 1 ] - '0';
626  sample.t_count = buff.first[ 3 ] - '0';
627  sample.c_count = buff.first[ 5 ] - '0';
628  sample.g_count = buff.first[ 7 ] - '0';
629  sample.n_count = buff.first[ 9 ] - '0';
630  sample.d_count = buff.first[ 11 ] - '0';
631 
632  // Jump to the position after the last entry.
633  it.jump_unchecked( 12 );
634  return;
635  }
636 
637  // If activated, allow the missing site notation `.:.:.:.:.:.` of Kapun.
638  if(
639  allow_missing_ &&
640  buff.first[ 1 ] == '.' &&
641  buff.first[ 3 ] == '.' &&
642  buff.first[ 5 ] == '.' &&
643  buff.first[ 7 ] == '.' &&
644  buff.first[ 9 ] == '.' &&
645  buff.first[ 11 ] == '.'
646  ) {
647  // Set everything to zero and signal zero counts or missing data.
648  sample.a_count = 0;
649  sample.t_count = 0;
650  sample.c_count = 0;
651  sample.g_count = 0;
652  sample.n_count = 0;
653  sample.d_count = 0;
654  sample.status.set( SampleCountsFilterTag::kMissing );
655 
656  // Jump to the position after the last entry.
657  it.jump_unchecked( 12 );
658  return;
659  }
660  }
661 
662  // If it's not the simply one-digit format, select the fastest alternative algorithm
663  // available for the given compiler.
664  #if defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
665 
666  parse_sample_gcc_intrinsic_( it, sample );
667 
668  #else
669 
670  parse_sample_simple_( it, sample );
671 
672  #endif
673 }
674 
675 // -------------------------------------------------------------------------
676 // skip_sample_
677 // -------------------------------------------------------------------------
678 
679 void SyncReader::skip_sample_(
680  utils::InputStream& input_stream
681 ) const {
682  using namespace genesis::utils;
683 
684  // The skip functions are slow, because they need char by char access to the input stream.
685  // Need to fix this at some point. For now, just read into an unused dummy.
686  // Not worth bothering with this too much now, as this is really fast anyway.
687  SampleCounts dummy;
688  parse_sample_( input_stream, dummy );
689 
690  // Simply skip everything.
691  // input_stream.read_char_or_throw( '\t' );
692  // skip_while( input_stream, is_digit );
693  // input_stream.read_char_or_throw( ':' );
694  // skip_while( input_stream, is_digit );
695  // input_stream.read_char_or_throw( ':' );
696  // skip_while( input_stream, is_digit );
697  // input_stream.read_char_or_throw( ':' );
698  // skip_while( input_stream, is_digit );
699  // input_stream.read_char_or_throw( ':' );
700  // skip_while( input_stream, is_digit );
701  // input_stream.read_char_or_throw( ':' );
702  // skip_while( input_stream, is_digit );
703 }
704 
705 } // namespace population
706 } // namespace genesis
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
parser.hpp
functions.hpp
genesis::utils::InputStream::source_name
std::string source_name() const
Get the input source name where this stream reads from.
Definition: input_stream.hpp:478
genesis::population::Variant::position
size_t position
Definition: variant.hpp:74
sync_reader.hpp
genesis::tree::length
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
Definition: tree/common_tree/functions.cpp:160
genesis::population::guess_alternative_base
char guess_alternative_base(Variant const &variant, bool force, SampleCountsFilterPolicy filter_policy)
Guess the alternative base of a Variant.
Definition: population/function/functions.cpp:495
genesis::population::Variant::reference_base
char reference_base
Definition: variant.hpp:79
genesis::population::SyncReader::parse_line
bool parse_line(utils::InputStream &input_stream, Variant &sample_set) const
Read a single line into the provided Variant.
Definition: sync_reader.cpp:160
genesis::utils::offset
void offset(Histogram &h, double value)
Definition: operations.cpp:47
genesis::utils::split
std::vector< std::string > split(std::string const &str, char delimiter, const bool trim_empty)
Spilt a string into parts, given a delimiter char.
Definition: string.cpp:575
genesis::utils
Definition: placement/formats/edge_color.hpp:42
genesis::population::SyncReader::read
std::vector< Variant > read(std::shared_ptr< utils::BaseInputSource > source) const
Read the whole input into a vector of Variants.
Definition: sync_reader.cpp:120
genesis::population::FilterStatus::set
void set(IntType value)
Definition: filter_status.hpp:100
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
sample_counts_filter.hpp
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
string.hpp
Provides some commonly used string utility functions.
genesis::population::is_valid_base_or_n
constexpr bool is_valid_base_or_n(char c)
Return whether a given base is in ACGTN, case insensitive.
Definition: population/function/functions.hpp:71
genesis::utils::read_until
std::string read_until(InputStream &source, char criterion)
Lexing function that reads from the stream until its current char equals the provided one....
Definition: scanner.hpp:254
genesis::population::Variant
A single variant at a position in a chromosome, along with SampleCounts for a set of samples.
Definition: variant.hpp:65
genesis::population::FilterStatus::reset
void reset()
Definition: filter_status.hpp:117
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::is_digit
constexpr bool is_digit(char c) noexcept
Return whether a char is a digit (0-9), ASCII-only.
Definition: char.hpp:95
variant_filter.hpp
char.hpp
genesis::population::VariantFilterTag::kMissing
@ kMissing
Position is missing in the input data.
genesis::population::SyncReader::read_header
std::vector< std::string > read_header(utils::InputStream &input_stream) const
Read the header line, if there is one. Do nothing if there is not.
Definition: sync_reader.cpp:56
genesis::utils::char_to_hex
std::string char_to_hex(char c, bool full)
Return the name and hex representation of a char.
Definition: char.cpp:118
genesis::population::Variant::samples
std::vector< SampleCounts > samples
Definition: variant.hpp:82
scanner.hpp
genesis::population::Variant::alternative_base
char alternative_base
Definition: variant.hpp:80
genesis::population::SampleCountsFilterTag::kMissing
@ kMissing
Position is missing in the input data.
genesis::population::Variant::status
FilterStatus status
Definition: variant.hpp:76
genesis::population::Variant::chromosome
std::string chromosome
Definition: variant.hpp:73