46 namespace population {
60 std::vector<std::string> result;
61 auto& it = input_stream;
62 if( !it || *it !=
'#' ) {
68 if( it && *it ==
'\t' ) {
75 if( result.size() < 3 || result[0] !=
"chr" || result[1] !=
"pos" || result[2] !=
"ref" ) {
76 throw std::runtime_error(
77 "Malformed sync " + it.source_name() +
": Header row provided (starting with '#'), " +
78 "but the first three entries are not \"chr\", \"pos\", \"ref\"."
81 result.erase( result.begin(), result.begin() + 3 );
87 std::vector<bool>
const& sample_filter
92 std::vector<std::string> result;
93 auto all_sample_names =
read_header( input_stream );
94 if( all_sample_names.empty() ) {
99 if( sample_filter.size() != all_sample_names.size() ) {
100 throw std::runtime_error(
102 ": Number of sample names in header (" +
104 ") does not match number of samples in filter (" +
108 for(
size_t i = 0; i < sample_filter.size(); ++i ) {
109 if( sample_filter[i] ) {
110 result.push_back( all_sample_names[i] );
121 std::shared_ptr< utils::BaseInputSource > source
123 std::vector<Variant> result;
132 while( parse_line_( it, variant, {}, false )) {
133 result.push_back( variant );
139 std::shared_ptr< utils::BaseInputSource > source,
140 std::vector<bool>
const& sample_filter
142 std::vector<Variant> result;
150 while( parse_line_( it, variant, sample_filter,
true )) {
151 result.push_back( variant );
164 return parse_line_( input_stream, variant, {}, false );
170 std::vector<bool>
const& sample_filter
172 return parse_line_( input_stream, variant, sample_filter,
true );
183 bool SyncReader::parse_line_(
186 std::vector<bool>
const& sample_filter,
187 bool use_sample_filter
192 auto& it = input_stream;
201 throw std::runtime_error(
202 "Malformed sync " + it.source_name() +
" at " + it.at() +
203 ": empty chromosome name"
206 it.read_char_or_throw(
'\t' );
207 variant.
position = parse_unsigned_integer<size_t>( it );
209 throw std::runtime_error(
210 "Malformed sync " + it.source_name() +
" at " + it.at() +
211 ": chromosome position == 0"
214 it.read_char_or_throw(
'\t' );
215 if( !it || *it ==
'\n' ) {
216 throw std::runtime_error(
217 std::string(
"In ") + it.source_name() +
": Unexpected end of line at " + it.at()
224 throw std::runtime_error(
225 std::string(
"In ") + it.source_name() +
": Invalid reference base char " +
234 size_t src_index = 0;
235 if( variant.
samples.empty() ) {
236 while( it && *it !=
'\n' ) {
237 if( !use_sample_filter || ( src_index < sample_filter.size() && sample_filter[src_index] )) {
238 variant.
samples.emplace_back();
239 parse_sample_( it, variant.
samples.back() );
248 size_t dst_index = 0;
249 while( it && *it !=
'\n' ) {
252 ( ! use_sample_filter ) ||
253 ( src_index < sample_filter.size() && sample_filter[src_index] )
256 if( dst_index >= variant.
samples.size() ) {
259 assert( dst_index < variant.
samples.size() );
260 parse_sample_( it, variant.
samples[dst_index] );
269 if( dst_index != variant.
samples.size() ) {
270 throw std::runtime_error(
271 "Malformed sync " + it.source_name() +
" at " + it.at() +
272 ": Line with different number of samples. Expecting " +
274 "but found " +
std::to_string( dst_index ) +
" (non-filtered) samples."
278 if( use_sample_filter && src_index != sample_filter.size() ) {
279 throw std::runtime_error(
280 "Malformed sync " + it.source_name() +
" at " + it.at() +
281 ": Number of samples in the line (" +
std::to_string( src_index ) +
282 ") does not match the number of filter entries (" +
291 if( guess_alt_base_ ) {
299 size_t missing_count = 0;
300 for(
auto const& sample : variant.
samples ) {
305 if( missing_count == variant.
samples.size() ) {
311 assert( !it || *it ==
'\n' );
321 #if defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
323 void SyncReader::parse_sample_gcc_intrinsic_(
324 utils::InputStream& input_stream,
328 auto& it = input_stream;
329 auto const buff = it.buffer();
338 if( buff.second < 6 * 8 ) {
339 parse_sample_simple_( it, sample );
370 auto get_chunk_ = [](
char const* buffer,
size_t offset )
377 std::memcpy( &chunk.data, buffer +
offset,
sizeof( chunk.data ));
383 auto const zero =
static_cast<uint64_t
>(0);
384 #define genesis_sync_reader_hasless(x,n) (((x)-~zero/255*(n))&~(x)&~zero/255*128)
385 #define genesis_sync_reader_hasmore(x,n) ((((x)+~zero/255*(127-(n)))|(x))&~zero/255*128)
388 auto const l = genesis_sync_reader_hasless( chunk.data,
'0' );
389 auto const m = genesis_sync_reader_hasmore( chunk.data,
'9' );
390 auto const p = l | m;
392 #undef genesis_sync_reader_hasless
393 #undef genesis_sync_reader_hasmore
398 if(
sizeof(
int) ==
sizeof(std::uint64_t) ) {
399 chunk.length = __builtin_ffs(p) / 8;
400 }
else if(
sizeof(
long) ==
sizeof(std::uint64_t) ) {
401 chunk.length = __builtin_ffsl(p) / 8;
402 }
else if(
sizeof(
long long) ==
sizeof(std::uint64_t) ) {
403 chunk.length = __builtin_ffsll(p) / 8;
406 (
sizeof(
int) ==
sizeof(std::uint64_t) ) ||
407 (
sizeof(
long) ==
sizeof(std::uint64_t) ) ||
408 (
sizeof(
long long) ==
sizeof(std::uint64_t) ),
409 "No compilter intrinsic __builtin_ffs[l][l] for std::uint64_t"
411 throw std::runtime_error(
412 "No compilter intrinsic __builtin_ffs[l][l] for std::uint64_t"
427 auto a_chunk = get_chunk_( buff.first, 1 );
428 auto t_chunk = get_chunk_( buff.first, a_chunk.offset + a_chunk.length );
429 auto c_chunk = get_chunk_( buff.first, t_chunk.offset + t_chunk.length );
430 auto g_chunk = get_chunk_( buff.first, c_chunk.offset + c_chunk.length );
431 auto n_chunk = get_chunk_( buff.first, g_chunk.offset + g_chunk.length );
432 auto d_chunk = get_chunk_( buff.first, n_chunk.offset + n_chunk.length );
435 assert( a_chunk.offset == 1 );
436 assert( t_chunk.offset == a_chunk.offset + a_chunk.length );
437 assert( c_chunk.offset == t_chunk.offset + t_chunk.length );
438 assert( g_chunk.offset == c_chunk.offset + c_chunk.length );
439 assert( n_chunk.offset == g_chunk.offset + g_chunk.length );
440 assert( d_chunk.offset == n_chunk.offset + n_chunk.length );
444 auto process_chunk_ = []( Chunk& chunk )
448 chunk.data <<= (8 * ( 8 - chunk.length + 1 ));
451 std::uint64_t lower_digits = (chunk.data & 0x0f000f000f000f00) >> 8;
452 std::uint64_t upper_digits = (chunk.data & 0x000f000f000f000f) * 10;
453 chunk.data = lower_digits + upper_digits;
456 lower_digits = (chunk.data & 0x00ff000000ff0000) >> 16;
457 upper_digits = (chunk.data & 0x000000ff000000ff) * 100;
458 chunk.data = lower_digits + upper_digits;
461 lower_digits = (chunk.data & 0x0000ffff00000000) >> 32;
462 upper_digits = (chunk.data & 0x000000000000ffff) * 10000;
463 chunk.data = lower_digits + upper_digits;
470 assert( chunk.length <= 8 );
471 return ( chunk.length > 1 );
479 good &= process_chunk_( a_chunk );
480 good &= process_chunk_( t_chunk );
481 good &= process_chunk_( c_chunk );
482 good &= process_chunk_( g_chunk );
483 good &= process_chunk_( n_chunk );
484 good &= process_chunk_( d_chunk );
487 sample.a_count = a_chunk.data;
488 sample.t_count = t_chunk.data;
489 sample.c_count = c_chunk.data;
490 sample.g_count = g_chunk.data;
491 sample.n_count = n_chunk.data;
492 sample.d_count = d_chunk.data;
505 buff.first[ t_chunk.offset - 1 ] !=
':' ||
506 buff.first[ c_chunk.offset - 1 ] !=
':' ||
507 buff.first[ g_chunk.offset - 1 ] !=
':' ||
508 buff.first[ n_chunk.offset - 1 ] !=
':' ||
509 buff.first[ d_chunk.offset - 1 ] !=
':'
512 parse_sample_simple_( it, sample );
518 assert( d_chunk.offset + d_chunk.length - 1 >= 12 );
521 assert( a_chunk.offset == 1 );
522 assert( t_chunk.offset >= 3 );
523 assert( c_chunk.offset >= 5 );
524 assert( g_chunk.offset >= 7 );
525 assert( n_chunk.offset >= 9 );
526 assert( d_chunk.offset >= 11 );
529 assert( a_chunk.length >= 2 );
530 assert( t_chunk.length >= 2 );
531 assert( c_chunk.length >= 2 );
532 assert( g_chunk.length >= 2 );
533 assert( n_chunk.length >= 2 );
534 assert( d_chunk.length >= 2 );
539 assert( sample.a_count < 10000000 );
540 assert( sample.t_count < 10000000 );
541 assert( sample.c_count < 10000000 );
542 assert( sample.g_count < 10000000 );
543 assert( sample.n_count < 10000000 );
544 assert( sample.d_count < 10000000 );
547 it.jump_unchecked( d_chunk.offset + d_chunk.length - 1 );
550 #endif // defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
556 void SyncReader::parse_sample_simple_(
557 utils::InputStream& input_stream,
561 auto& it = input_stream;
562 it.read_char_or_throw(
'\t' );
566 sample.a_count = parse_unsigned_integer<size_t>( it );
567 it.read_char_or_throw(
':' );
568 sample.t_count = parse_unsigned_integer<size_t>( it );
569 it.read_char_or_throw(
':' );
570 sample.c_count = parse_unsigned_integer<size_t>( it );
571 it.read_char_or_throw(
':' );
572 sample.g_count = parse_unsigned_integer<size_t>( it );
573 it.read_char_or_throw(
':' );
574 sample.n_count = parse_unsigned_integer<size_t>( it );
575 it.read_char_or_throw(
':' );
576 sample.d_count = parse_unsigned_integer<size_t>( it );
583 void SyncReader::parse_sample_(
584 utils::InputStream& input_stream,
588 auto& it = input_stream;
589 auto const buff = it.buffer();
592 sample.status.reset();
601 buff.first[ 0 ] ==
'\t' &&
602 buff.first[ 2 ] ==
':' &&
603 buff.first[ 4 ] ==
':' &&
604 buff.first[ 6 ] ==
':' &&
605 buff.first[ 8 ] ==
':' &&
606 buff.first[ 10 ] ==
':'
625 sample.a_count = buff.first[ 1 ] -
'0';
626 sample.t_count = buff.first[ 3 ] -
'0';
627 sample.c_count = buff.first[ 5 ] -
'0';
628 sample.g_count = buff.first[ 7 ] -
'0';
629 sample.n_count = buff.first[ 9 ] -
'0';
630 sample.d_count = buff.first[ 11 ] -
'0';
633 it.jump_unchecked( 12 );
640 buff.first[ 1 ] ==
'.' &&
641 buff.first[ 3 ] ==
'.' &&
642 buff.first[ 5 ] ==
'.' &&
643 buff.first[ 7 ] ==
'.' &&
644 buff.first[ 9 ] ==
'.' &&
645 buff.first[ 11 ] ==
'.'
657 it.jump_unchecked( 12 );
664 #if defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
666 parse_sample_gcc_intrinsic_( it, sample );
670 parse_sample_simple_( it, sample );
679 void SyncReader::skip_sample_(
680 utils::InputStream& input_stream
688 parse_sample_( input_stream, dummy );