43 namespace population {
54 std::string& cur_chr,
size_t& cur_pos,
61 throw std::runtime_error(
63 ": unordered chromosomes and positions"
70 std::shared_ptr< utils::BaseInputSource > source
72 std::vector<Variant> result;
76 std::string cur_chr =
"";
79 while( parse_line_( it, variant, {}, false )) {
81 result.push_back( std::move( variant ));
88 std::shared_ptr< utils::BaseInputSource > source,
89 std::vector<bool>
const& sample_filter
91 std::vector<Variant> result;
95 std::string cur_chr =
"";
98 while( parse_line_( it, variant, sample_filter,
true )) {
100 result.push_back( std::move( variant ));
110 return parse_line_( input_stream, variant, {}, false );
116 std::vector<bool>
const& sample_filter
118 return parse_line_( input_stream, variant, sample_filter,
true );
125 bool SyncReader::parse_line_(
128 std::vector<bool>
const& sample_filter,
129 bool use_sample_filter
134 auto& it = input_stream;
143 throw std::runtime_error(
144 "Malformed sync " + it.source_name() +
" at " + it.at() +
145 ": empty chromosome name"
148 it.read_char_or_throw(
'\t' );
149 variant.
position = it.parse_unsigned_integer<
size_t>();
151 throw std::runtime_error(
152 "Malformed sync " + it.source_name() +
" at " + it.at() +
153 ": chromosome position == 0"
156 it.read_char_or_throw(
'\t' );
157 if( !it || *it ==
'\n' ) {
158 throw std::runtime_error(
159 std::string(
"In ") + it.source_name() +
": Unexpected end of line at " + it.at()
165 if( rb !=
'A' && rb !=
'C' && rb !=
'G' && rb !=
'T' && rb !=
'N' && rb !=
'.' && rb !=
'*' ) {
166 throw std::runtime_error(
167 std::string(
"In ") + it.source_name() +
": Invalid reference base char " +
176 size_t src_index = 0;
177 if( variant.
samples.empty() ) {
178 while( it && *it !=
'\n' ) {
179 if( !use_sample_filter || ( src_index < sample_filter.size() && sample_filter[src_index] )) {
180 variant.
samples.emplace_back();
181 parse_sample_( it, variant.
samples.back() );
190 size_t dst_index = 0;
191 while( it && *it !=
'\n' ) {
193 if( dst_index >= variant.
samples.size() ) {
198 if( !use_sample_filter || ( src_index < sample_filter.size() && sample_filter[src_index] )) {
199 assert( dst_index < variant.
samples.size() );
200 parse_sample_( it, variant.
samples[dst_index] );
209 if( dst_index != variant.
samples.size() ) {
210 throw std::runtime_error(
211 "Malformed sync " + it.source_name() +
" at " + it.at() +
212 ": Line with different number of samples."
216 if( use_sample_filter && src_index != sample_filter.size() ) {
217 throw std::runtime_error(
218 "Malformed sync " + it.source_name() +
" at " + it.at() +
219 ": Number of samples in the line does not match the number of filter entries."
229 assert( !it || *it ==
'\n' );
235 #if defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
237 void SyncReader::parse_sample_gcc_intrinsic_(
238 utils::InputStream& input_stream,
242 auto& it = input_stream;
243 auto const buff = it.buffer();
252 if( buff.second < 6 * 8 ) {
253 parse_sample_simple_( it, sample );
284 auto get_chunk_ = [](
char const* buffer,
size_t offset )
291 std::memcpy( &chunk.data, buffer +
offset,
sizeof( chunk.data ));
297 auto const zero =
static_cast<uint64_t
>(0);
298 #define hasless(x,n) (((x)-~zero/255*(n))&~(x)&~zero/255*128)
299 #define hasmore(x,n) ((((x)+~zero/255*(127-(n)))|(x))&~zero/255*128)
302 auto const l = hasless( chunk.data,
'0' );
303 auto const m = hasmore( chunk.data,
'9' );
304 auto const p = l | m;
312 if(
sizeof(
int) ==
sizeof(std::uint64_t) ) {
313 chunk.length = __builtin_ffs(p) / 8;
314 }
else if(
sizeof(
long) ==
sizeof(std::uint64_t) ) {
315 chunk.length = __builtin_ffsl(p) / 8;
316 }
else if(
sizeof(
long long) ==
sizeof(std::uint64_t) ) {
317 chunk.length = __builtin_ffsll(p) / 8;
320 (
sizeof(
int) ==
sizeof(std::uint64_t) ) ||
321 (
sizeof(
long) ==
sizeof(std::uint64_t) ) ||
322 (
sizeof(
long long) ==
sizeof(std::uint64_t) ),
323 "No compilter intrinsic __builtin_ffs[l][l] for std::uint64_t"
325 throw std::runtime_error(
326 "No compilter intrinsic __builtin_ffs[l][l] for std::uint64_t"
341 auto a_chunk = get_chunk_( buff.first, 1 );
342 auto t_chunk = get_chunk_( buff.first, a_chunk.offset + a_chunk.length );
343 auto c_chunk = get_chunk_( buff.first, t_chunk.offset + t_chunk.length );
344 auto g_chunk = get_chunk_( buff.first, c_chunk.offset + c_chunk.length );
345 auto n_chunk = get_chunk_( buff.first, g_chunk.offset + g_chunk.length );
346 auto d_chunk = get_chunk_( buff.first, n_chunk.offset + n_chunk.length );
349 assert( a_chunk.offset == 1 );
350 assert( t_chunk.offset == a_chunk.offset + a_chunk.length );
351 assert( c_chunk.offset == t_chunk.offset + t_chunk.length );
352 assert( g_chunk.offset == c_chunk.offset + c_chunk.length );
353 assert( n_chunk.offset == g_chunk.offset + g_chunk.length );
354 assert( d_chunk.offset == n_chunk.offset + n_chunk.length );
358 auto process_chunk_ = []( Chunk& chunk )
362 chunk.data <<= (8 * ( 8 - chunk.length + 1 ));
365 std::uint64_t lower_digits = (chunk.data & 0x0f000f000f000f00) >> 8;
366 std::uint64_t upper_digits = (chunk.data & 0x000f000f000f000f) * 10;
367 chunk.data = lower_digits + upper_digits;
370 lower_digits = (chunk.data & 0x00ff000000ff0000) >> 16;
371 upper_digits = (chunk.data & 0x000000ff000000ff) * 100;
372 chunk.data = lower_digits + upper_digits;
375 lower_digits = (chunk.data & 0x0000ffff00000000) >> 32;
376 upper_digits = (chunk.data & 0x000000000000ffff) * 10000;
377 chunk.data = lower_digits + upper_digits;
384 assert( chunk.length <= 8 );
385 return ( chunk.length > 1 );
393 good &= process_chunk_( a_chunk );
394 good &= process_chunk_( t_chunk );
395 good &= process_chunk_( c_chunk );
396 good &= process_chunk_( g_chunk );
397 good &= process_chunk_( n_chunk );
398 good &= process_chunk_( d_chunk );
401 sample.a_count = a_chunk.data;
402 sample.t_count = t_chunk.data;
403 sample.c_count = c_chunk.data;
404 sample.g_count = g_chunk.data;
405 sample.n_count = n_chunk.data;
406 sample.d_count = d_chunk.data;
419 buff.first[ t_chunk.offset - 1 ] !=
':' ||
420 buff.first[ c_chunk.offset - 1 ] !=
':' ||
421 buff.first[ g_chunk.offset - 1 ] !=
':' ||
422 buff.first[ n_chunk.offset - 1 ] !=
':' ||
423 buff.first[ d_chunk.offset - 1 ] !=
':'
426 parse_sample_simple_( it, sample );
432 assert( d_chunk.offset + d_chunk.length - 1 >= 12 );
435 assert( a_chunk.offset == 1 );
436 assert( t_chunk.offset >= 3 );
437 assert( c_chunk.offset >= 5 );
438 assert( g_chunk.offset >= 7 );
439 assert( n_chunk.offset >= 9 );
440 assert( d_chunk.offset >= 11 );
443 assert( a_chunk.length >= 2 );
444 assert( t_chunk.length >= 2 );
445 assert( c_chunk.length >= 2 );
446 assert( g_chunk.length >= 2 );
447 assert( n_chunk.length >= 2 );
448 assert( d_chunk.length >= 2 );
453 assert( sample.a_count < 10000000 );
454 assert( sample.t_count < 10000000 );
455 assert( sample.c_count < 10000000 );
456 assert( sample.g_count < 10000000 );
457 assert( sample.n_count < 10000000 );
458 assert( sample.d_count < 10000000 );
461 it.jump_unchecked( d_chunk.offset + d_chunk.length - 1 );
464 #endif // defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
466 void SyncReader::parse_sample_simple_(
467 utils::InputStream& input_stream,
471 auto& it = input_stream;
472 it.read_char_or_throw(
'\t' );
476 sample.a_count = it.parse_unsigned_integer<
size_t>();
477 it.read_char_or_throw(
':' );
478 sample.t_count = it.parse_unsigned_integer<
size_t>();
479 it.read_char_or_throw(
':' );
480 sample.c_count = it.parse_unsigned_integer<
size_t>();
481 it.read_char_or_throw(
':' );
482 sample.g_count = it.parse_unsigned_integer<
size_t>();
483 it.read_char_or_throw(
':' );
484 sample.n_count = it.parse_unsigned_integer<
size_t>();
485 it.read_char_or_throw(
':' );
486 sample.d_count = it.parse_unsigned_integer<
size_t>();
489 void SyncReader::parse_sample_(
490 utils::InputStream& input_stream,
494 auto& it = input_stream;
495 auto const buff = it.buffer();
505 buff.first[ 0 ] ==
'\t' &&
506 buff.first[ 2 ] ==
':' &&
507 buff.first[ 4 ] ==
':' &&
508 buff.first[ 6 ] ==
':' &&
509 buff.first[ 8 ] ==
':' &&
510 buff.first[ 10 ] ==
':' &&
520 sample.a_count = buff.first[ 1 ] -
'0';
521 sample.t_count = buff.first[ 3 ] -
'0';
522 sample.c_count = buff.first[ 5 ] -
'0';
523 sample.g_count = buff.first[ 7 ] -
'0';
524 sample.n_count = buff.first[ 9 ] -
'0';
525 sample.d_count = buff.first[ 11 ] -
'0';
528 it.jump_unchecked( 12 );
534 #if defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)
536 parse_sample_gcc_intrinsic_( it, sample );
540 parse_sample_simple_( it, sample );
545 void SyncReader::skip_sample_(
546 utils::InputStream& input_stream
554 parse_sample_( input_stream, dummy );