52 namespace population {
67 std::vector<std::string> result;
68 result.resize( header_info_.sample_infos.size() );
69 for(
auto const& sample_info : header_info_.sample_infos ) {
70 assert( sample_info.second.index < result.size() );
71 result[ sample_info.second.index ] = sample_info.first;
80 void FrequencyTableInputStream::Iterator::parse_header_()
87 input_stream_->get_line(), parent_->separator_char_,
false
92 std::unordered_set<std::string> all_samplenames;
98 for(
auto const& field : header_fields ) {
100 if( field.empty() || ! std::all_of( field.begin(), field.end(),
utils::is_graph )) {
101 throw std::runtime_error(
102 "Invalid frequency table with non-graph characters or empty field in header."
107 parse_header_field_( field, all_samplenames );
112 if( column_processors_.size() != header_fields.size() ) {
113 throw std::domain_error(
114 "Internal error: Number of column processors does not match number of columns"
120 check_header_fields_( all_samplenames );
125 assert( current_variant_->reference_base ==
'N' );
126 assert( current_variant_->alternative_base ==
'N' );
130 sample_data_->resize( header_info_.sample_infos.size() );
131 current_variant_->samples.resize( header_info_.sample_infos.size() );
138 void FrequencyTableInputStream::Iterator::check_header_fields_(
139 std::unordered_set<std::string>
const& all_samplenames
143 if( ! header_info_.has_chr ) {
144 throw std::runtime_error(
145 "Invalid frequency table that does not contain a chromosome column"
148 if( ! header_info_.has_pos ) {
149 throw std::runtime_error(
150 "Invalid frequency table that does not contain a position column"
157 for(
auto const& sample_info_entry : header_info_.sample_infos ) {
158 auto const& sample_info = sample_info_entry.second;
162 bool good = sample_info.has_frq;
163 good |= ( sample_info.has_ref && sample_info.has_alt );
164 good |= ( sample_info.has_cov && ( sample_info.has_ref || sample_info.has_alt ));
166 throw std::runtime_error(
167 "Frequency table sample \"" + sample_info_entry.first +
"\" does not contain enough "
168 "information to compute allele counts."
177 int sample_flags = 0;
178 for(
auto const& sample_info_entry : header_info_.sample_infos ) {
179 auto const& sample_info = sample_info_entry.second;
184 flag += sample_info.has_ref ? 1 : 0;
185 flag += sample_info.has_alt ? 2 : 0;
186 flag += sample_info.has_frq ? 4 : 0;
187 flag += sample_info.has_cov ? 8 : 0;
189 if( sample_flags == 0 ) {
194 if( sample_flags != flag ) {
195 LOG_WARN <<
"Frequency table samples contain different types of data "
196 <<
"(reference or alternative counts, frequencies, or read depth). "
197 <<
"We can handle this, but it might indicate that something went wrong "
198 <<
"when parsing and interpreting the header fields to obtain sample names.";
205 for(
auto const& sn : parent_->sample_names_filter_ ) {
206 if( all_samplenames.count( sn ) == 0 ) {
207 throw std::invalid_argument(
208 "Frequency table header does not contain given sample name filter \"" + sn +
"\"."
222 void FrequencyTableInputStream::Iterator::parse_header_field_(
223 std::string
const& field,
224 std::unordered_set<std::string>& all_samplenames
227 assert( ! field.empty() );
242 matches += evaluate_if_field_is_chr_( field );
243 matches += evaluate_if_field_is_pos_( field );
244 matches += evaluate_if_field_is_ref_( field );
245 matches += evaluate_if_field_is_alt_( field );
246 matches += evaluate_if_field_is_sample_ref_( field, all_samplenames );
247 matches += evaluate_if_field_is_sample_alt_( field, all_samplenames );
248 matches += evaluate_if_field_is_sample_frq_( field, all_samplenames );
249 matches += evaluate_if_field_is_sample_cov_( field, all_samplenames );
254 auto cur_var = current_variant_;
255 auto const sep_char = parent_->separator_char_;
258 it, [sep_char](
char c ){
259 return c ==
'\n' || c == sep_char;
267 throw std::runtime_error(
268 "Cannot read frequency table header, as it contains ambiguous headers. "
269 "Header field name \"" + field +
"\" matches multiple types of data columns."
278 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_chr_(
279 std::string
const& field
282 assert( ! field.empty() );
288 if( ! match_header_field_( field, parent_->usr_chr_name_, parent_->chr_names_ )) {
293 if( header_info_.has_chr ) {
294 throw std::runtime_error(
295 "Cannot unambiguously parse frequency table header, "
296 "as it contains multiple columns for the chromosome."
299 header_info_.has_chr =
true;
303 auto cur_var = current_variant_;
304 auto const sep_char = parent_->separator_char_;
310 it, [sep_char](
char c ){
314 if( cur_var->chromosome.empty() ) {
315 throw std::runtime_error(
316 "Malformed frequency table with empty chromosome name in line " + it.
at()
329 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_pos_(
330 std::string
const& field
336 assert( ! field.empty() );
337 if( ! match_header_field_( field, parent_->usr_pos_name_, parent_->pos_names_ )) {
340 if( header_info_.has_pos ) {
341 throw std::runtime_error(
342 "Cannot unambiguously parse frequency table header, "
343 "as it contains multiple columns for the position."
346 header_info_.has_pos =
true;
347 auto cur_var = current_variant_;
349 cur_var->position = utils::parse_unsigned_integer<size_t>( it );
350 if( cur_var->position == 0 ) {
351 throw std::runtime_error(
352 "Malformed frequency table with position == 0 in line " + it.
at()
363 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_ref_(
364 std::string
const& field
370 assert( ! field.empty() );
371 if( ! match_header_field_( field, parent_->usr_ref_name_, parent_->ref_names_ )) {
374 if( header_info_.has_ref ) {
375 throw std::runtime_error(
376 "Cannot unambiguously parse frequency table header, "
377 "as it contains multiple columns for the reference base."
380 header_info_.has_ref =
true;
381 auto cur_var = current_variant_;
387 throw std::runtime_error(
388 "Malformed frequency table with reference base not in [ACGTN] in line " + it.
at()
391 cur_var->reference_base = b;
401 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_alt_(
402 std::string
const& field
408 assert( ! field.empty() );
409 if( ! match_header_field_( field, parent_->usr_alt_name_, parent_->alt_names_ )) {
412 if( header_info_.has_alt ) {
413 throw std::runtime_error(
414 "Cannot unambiguously parse frequency table header, "
415 "as it contains multiple columns for the alternative base."
418 header_info_.has_alt =
true;
419 auto cur_var = current_variant_;
424 throw std::runtime_error(
425 "Malformed frequency table with alternative base not in [ACGTN] in line " + it.
at()
428 cur_var->alternative_base = b;
438 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_sample_ref_(
439 std::string
const& field,
440 std::unordered_set<std::string>& all_samplenames
449 assert( ! field.empty() );
450 std::string samplename;
451 if( ! match_header_sample_(
452 field, parent_->usr_smp_ref_name_, parent_->ref_names_, parent_->cnt_names_, samplename
458 all_samplenames.insert( samplename );
463 if( is_ignored_sample_( samplename )) {
465 auto parent = parent_;
469 if( ! parse_if_missing_( parent, it ) ) {
470 utils::parse_unsigned_integer<size_t>( it );
478 auto& sample_info = get_sample_info_( samplename );
479 if( sample_info.has_ref ) {
480 throw std::runtime_error(
481 "Cannot unambiguously parse frequency table header, as it contains multiple columns "
482 "for the reference count of sample \"" + samplename +
"\"."
485 sample_info.has_ref =
true;
494 assert( sample_info.index < std::numeric_limits<size_t>::max() );
495 auto parent = parent_;
496 auto sample_data = sample_data_;
497 auto index = sample_info.index;
499 assert( index < sample_data->size() );
500 if( parse_if_missing_( parent, it ) ) {
501 sample_data->at(index).is_missing =
true;
503 sample_data->at(index).ref_cnt = utils::parse_unsigned_integer<size_t>( it );
515 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_sample_alt_(
516 std::string
const& field,
517 std::unordered_set<std::string>& all_samplenames
526 assert( ! field.empty() );
527 std::string samplename;
528 if( ! match_header_sample_(
529 field, parent_->usr_smp_alt_name_, parent_->alt_names_, parent_->cnt_names_, samplename
533 all_samplenames.insert( samplename );
534 if( is_ignored_sample_( samplename )) {
535 auto parent = parent_;
537 if( ! parse_if_missing_( parent, it ) ) {
538 utils::parse_unsigned_integer<size_t>( it );
543 auto& sample_info = get_sample_info_( samplename );
544 if( sample_info.has_alt ) {
545 throw std::runtime_error(
546 "Cannot unambiguously parse frequency table header, as it contains multiple columns "
547 "for the alternative count of sample \"" + samplename +
"\"."
550 sample_info.has_alt =
true;
551 assert( sample_info.index < std::numeric_limits<size_t>::max() );
552 auto parent = parent_;
553 auto sample_data = sample_data_;
554 auto index = sample_info.index;
556 assert( index < sample_data->size() );
557 if( parse_if_missing_( parent, it ) ) {
558 sample_data->at(index).is_missing =
true;
560 sample_data->at(index).alt_cnt = utils::parse_unsigned_integer<size_t>( it );
570 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_sample_frq_(
571 std::string
const& field,
572 std::unordered_set<std::string>& all_samplenames
578 assert( ! field.empty() );
579 std::string samplename;
580 if( ! match_header_sample_(
581 field, parent_->usr_smp_frq_name_, parent_->frq_names_, samplename
585 all_samplenames.insert( samplename );
586 if( is_ignored_sample_( samplename )) {
587 auto parent = parent_;
589 if( ! parse_if_missing_( parent, it ) ) {
590 utils::parse_float<double>( it );
595 auto& sample_info = get_sample_info_( samplename );
596 if( sample_info.has_frq ) {
597 throw std::runtime_error(
598 "Cannot unambiguously parse frequency table header, as it contains multiple columns "
599 "for the frequency of sample \"" + samplename +
"\"."
602 sample_info.has_frq =
true;
603 assert( sample_info.index < std::numeric_limits<size_t>::max() );
604 auto parent = parent_;
605 auto sample_data = sample_data_;
606 auto index = sample_info.index;
608 assert( index < sample_data->size() );
609 if( parse_if_missing_( parent, it ) ) {
610 sample_data->at(index).is_missing =
true;
612 sample_data->at(index).frq = utils::parse_float<double>( it );
622 int FrequencyTableInputStream::Iterator::evaluate_if_field_is_sample_cov_(
623 std::string
const& field,
624 std::unordered_set<std::string>& all_samplenames
630 assert( ! field.empty() );
631 std::string samplename;
632 if( ! match_header_sample_(
633 field, parent_->usr_smp_cov_name_, parent_->cov_names_, samplename
637 all_samplenames.insert( samplename );
638 if( is_ignored_sample_( samplename )) {
639 auto parent = parent_;
641 if( ! parse_if_missing_( parent, it ) ) {
642 utils::parse_unsigned_integer<size_t>( it );
647 auto& sample_info = get_sample_info_( samplename );
648 if( sample_info.has_cov ) {
649 throw std::runtime_error(
650 "Cannot unambiguously parse frequency table header, as it contains multiple columns "
651 "for the read depth of sample \"" + samplename +
"\"."
654 sample_info.has_cov =
true;
655 assert( sample_info.index < std::numeric_limits<size_t>::max() );
656 auto parent = parent_;
657 auto sample_data = sample_data_;
658 auto index = sample_info.index;
660 assert( index < sample_data->size() );
661 if( parse_if_missing_( parent, it ) ) {
662 sample_data->at(index).is_missing =
true;
664 sample_data->at(index).cov = utils::parse_unsigned_integer<size_t>( it );
674 FrequencyTableInputStream::Iterator::SampleInfo&
675 FrequencyTableInputStream::Iterator::get_sample_info_(
676 std::string
const& samplename
680 if( header_info_.sample_infos.count( samplename ) == 0 ) {
683 auto const next_index = header_info_.sample_infos.size();
684 header_info_.sample_infos[samplename].index = next_index;
689 assert( header_info_.sample_infos.count( samplename ) > 0 );
690 assert( header_info_.sample_infos[ samplename ].index < std::numeric_limits<size_t>::max() );
691 return header_info_.sample_infos[ samplename ];
694 bool FrequencyTableInputStream::Iterator::is_ignored_sample_(
695 std::string
const& samplename
699 if( parent_->sample_names_filter_.empty() ) {
704 auto const found = ( parent_->sample_names_filter_.count( samplename ) > 0 );
705 return !( found ^ parent_->inverse_sample_names_filter_ );
708 bool FrequencyTableInputStream::Iterator::parse_if_missing_(
712 auto const buffer = input_stream.
buffer();
717 auto check_missing_and_skip_ = [&input_stream](
718 char const* lhs,
size_t lhs_len,
char const* rhs,
size_t rhs_len
721 lhs_len >= rhs_len &&
733 if( parent->usr_missing_.empty() ) {
734 for(
auto const& missing_word : parent->missing_ ) {
735 auto const is_missing = check_missing_and_skip_(
736 buffer.first, buffer.second,
737 missing_word.c_str(), missing_word.size()
744 auto const is_missing = check_missing_and_skip_(
745 buffer.first, buffer.second,
746 parent->usr_missing_.c_str(), parent->usr_missing_.size()
759 bool FrequencyTableInputStream::Iterator::match_header_field_(
760 std::string
const& field,
761 std::string
const& user_string,
762 std::vector<std::string>
const& predefined_list
769 assert( ! field.empty() );
770 if( ! user_string.empty() ) {
771 return field == user_string;
776 bool FrequencyTableInputStream::Iterator::match_header_sample_(
777 std::string
const& field,
778 std::string
const& user_substring,
779 std::vector<std::string>
const& predefined_list,
780 std::string& samplename
786 assert( ! field.empty() );
787 if( ! user_substring.empty() ) {
788 return match_header_sample_user_partial_( field, user_substring, samplename );
793 for(
auto const& name : predefined_list ) {
794 if( match_header_sample_predefined_partial_( field, name, samplename )) {
801 bool FrequencyTableInputStream::Iterator::match_header_sample_(
802 std::string
const& field,
803 std::string
const& user_substring,
804 std::vector<std::string>
const& predefined_list1,
805 std::vector<std::string>
const& predefined_list2,
806 std::string& samplename
812 assert( ! field.empty() );
813 if( ! user_substring.empty() ) {
814 return match_header_sample_user_partial_( field, user_substring, samplename );
825 for(
auto const& name1 : predefined_list1 ) {
826 for(
auto const& name2 : predefined_list2 ) {
827 auto name = name1 + name2;
828 if( match_header_sample_predefined_partial_( field, name, samplename )) {
831 name = name2 + name1;
832 if( match_header_sample_predefined_partial_( field, name, samplename )) {
840 bool FrequencyTableInputStream::Iterator::match_header_sample_user_partial_(
841 std::string
const& field,
842 std::string
const& substring,
843 std::string& samplename
850 if(
utils::ends_with( field, substring, samplename ) && ! samplename.empty() ) {
856 bool FrequencyTableInputStream::Iterator::match_header_sample_predefined_partial_(
857 std::string
const& field,
858 std::string
const& substring,
859 std::string& samplename
881 void FrequencyTableInputStream::Iterator::increment_()
884 assert( input_stream_ );
886 auto& it = *input_stream_;
897 assert( sample_data_ );
898 for(
auto& data : *sample_data_ ) {
904 size_t processor_index = 0;
905 while( it && *it !=
'\n' ) {
906 if( processor_index >= column_processors_.size() ) {
907 throw std::runtime_error(
908 "Error while processing frequency table: More columns in line " +
915 column_processors_[processor_index]( it );
918 if( it && ( *it !=
'\n' && *it != parent_->separator_char_ )) {
919 throw std::runtime_error(
920 "Error while processing frequency table: Unexpected char " +
926 assert( !it || ( *it ==
'\n' || *it == parent_->separator_char_ ));
927 if( it && *it == parent_->separator_char_ ) {
934 assert( !it || *it ==
'\n');
940 if( processor_index != column_processors_.size() ) {
941 assert( processor_index < column_processors_.size() );
942 throw std::runtime_error(
943 "Error while processing frequency table: Fewer columns in line " +
949 if( parent_->ref_genome_ ) {
951 assert( current_variant_->chromosome.size() > 0 );
952 assert( current_variant_->position > 0 );
953 auto const ref_gen_base = parent_->ref_genome_->get_base(
954 current_variant_->chromosome, current_variant_->position
958 if( header_info_.has_ref &&
utils::to_upper( current_variant_->reference_base ) !=
'N' ) {
960 auto const ref_base =
utils::to_upper( current_variant_->reference_base );
967 throw std::runtime_error(
968 "At chromosome \"" + current_variant_->chromosome +
"\" position " +
970 ", the provided reference genome has base '" +
971 std::string( 1, ref_gen_base ) +
972 "', while the reference base column in the frequency file is '" +
973 std::string( 1, ref_base ) +
974 "', which is not contained in the referenge genome, " +
975 "and hence likely indicates an issue with the data"
979 assert( header_info_.has_ref || current_variant_->reference_base ==
'N' );
987 current_variant_->reference_base =
'N';
994 assert( header_info_.has_ref || current_variant_->reference_base ==
'N' );
995 assert( header_info_.has_alt || current_variant_->alternative_base ==
'N' );
999 assert( sample_data_ );
1000 assert( current_variant_ );
1001 assert( header_info_.sample_infos.size() == sample_data_->size() );
1002 assert( header_info_.sample_infos.size() == current_variant_->samples.size() );
1007 for(
auto const& sample_info : header_info_.sample_infos ) {
1008 auto const index = sample_info.second.index;
1009 assert( index < sample_data_->size() );
1010 assert( index < current_variant_->samples.size() );
1011 process_sample_data_( sample_info.second, sample_data_->at(index), *current_variant_, index );
1015 current_variant_->status.reset();
1016 size_t missing_count = 0;
1017 for(
auto const& sample : current_variant_->samples ) {
1022 if( missing_count == current_variant_->samples.size() ) {
1031 void FrequencyTableInputStream::Iterator::process_sample_data_(
1032 FrequencyTableInputStream::Iterator::SampleInfo
const& sample_info,
1033 FrequencyTableInputStream::Iterator::SampleData
const& sample_data,
1041 bool do_frq_check =
false;
1044 variant.samples[sample_index] = SampleCounts();
1045 if( sample_data.is_missing ) {
1052 if( sample_info.has_ref && sample_info.has_alt ) {
1055 ref_cnt = sample_data.ref_cnt;
1056 alt_cnt = sample_data.alt_cnt;
1057 do_frq_check =
true;
1061 if( sample_info.has_cov && sample_data.cov != sample_data.ref_cnt + sample_data.alt_cnt ) {
1062 throw std::runtime_error(
1063 "Invalid read depth that is not the sum of the reference and alternative base counts."
1067 }
else if( sample_info.has_ref && sample_info.has_cov ) {
1070 assert( ! sample_info.has_alt );
1073 if( sample_data.cov < sample_data.ref_cnt ) {
1074 throw std::runtime_error(
1075 "Invalid read depth that is smaller than the reference base count."
1080 ref_cnt = sample_data.ref_cnt;
1081 alt_cnt = sample_data.cov - sample_data.ref_cnt;
1082 do_frq_check =
true;
1084 }
else if( sample_info.has_alt && sample_info.has_cov ) {
1088 assert( ! sample_info.has_ref );
1089 throw std::runtime_error(
1090 "Invalid read depth that is smaller than the alternative base count."
1092 ref_cnt = sample_data.cov - sample_data.alt_cnt;
1093 alt_cnt = sample_data.ref_cnt;
1094 do_frq_check =
true;
1096 }
else if( sample_info.has_frq ) {
1100 static_cast<int>( sample_info.has_ref ) +
1101 static_cast<int>( sample_info.has_alt ) +
1102 static_cast<int>( sample_info.has_cov )
1108 auto frq = sample_data.frq;
1111 throw std::runtime_error(
"Invalid frequency < 0.0 in frequency table." );
1117 throw std::runtime_error(
"Invalid frequency > 1.0 in frequency table." );
1121 assert(( !std::isfinite( frq )) ^ ( 0.0 <= frq && frq <= 1.0 ));
1127 if( ! std::isfinite( frq )) {
1131 }
else if( sample_info.has_cov ) {
1134 alt_cnt = sample_data.cov - ref_cnt;
1135 }
else if( sample_info.has_ref ) {
1140 ref_cnt = sample_data.ref_cnt;
1141 auto const ref_dbl =
static_cast<double>( ref_cnt );
1143 }
else if( sample_info.has_alt ) {
1145 alt_cnt = sample_data.alt_cnt;
1146 auto const alt_dbl =
static_cast<double>( alt_cnt );
1151 auto const int_fac = parent_->int_factor_;
1164 if( ! parent_->frequency_is_ref_ ) {
1171 throw std::domain_error(
"Internal error: No valid data type to parse frequency table." );
1175 if( do_frq_check && sample_info.has_frq ) {
1176 auto const ref =
static_cast<double>( ref_cnt );
1177 auto const alt =
static_cast<double>( alt_cnt );
1178 auto const frq = ( parent_->frequency_is_ref_ ? ref : alt ) / ( ref + alt );
1180 throw std::runtime_error(
1181 "Mismatching frequency value ~" +
std::to_string( sample_data.frq ) +
1182 " that has a difference greater than the allowed relative error (" +
1183 std::to_string( parent_->allowed_rel_freq_error_ ) +
") to the frequency " +
1206 assert( sample_index < variant.samples.size() );
1207 assert( ref_base !=
'N' && ref_base !=
'n' );
1208 assert( alt_base !=
'N' && alt_base !=
'n' );
1209 if( ref_base == alt_base ) {
1210 throw std::runtime_error(
1211 "At chromosome \"" + variant.chromosome +
"\" position " +
1213 ": Invalid reference and alternative base that are both '" +
1214 std::string( 1, ref_base ) +
"' in frequency table."
1219 set_base_count( variant.samples[sample_index], ref_base, ref_cnt );
1220 set_base_count( variant.samples[sample_index], alt_base, alt_cnt );