65 return ( lhs.first > rhs.first ) || ( lhs.first == rhs.first && lhs.second < rhs.second );
72 template<
typename CharSelector >
75 bool const allow_gaps,
76 CharSelector
const& char_selector
81 throw std::runtime_error(
82 "Computation of this consensus sequence function is only meant "
83 "for nucleic acid codes (ACGT)."
89 result.reserve( counts.
length() );
94 auto const num_chars = counts.
characters().size();
97 auto const gap_char =
'-';
100 assert( num_chars == 4 );
103 if( seq_count == 0 ) {
105 result = std::string( counts.
length(), gap_char );
110 for(
size_t site_idx = 0; site_idx < counts.
length(); ++site_idx ) {
113 size_t counts_sum = 0;
117 std::vector< CountPair > counts_map;
118 counts_map.reserve( 4 );
121 for(
size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
122 auto const char_count = counts.
count_at( char_idx, site_idx );
123 counts_sum += char_count;
124 counts_map.push_back({ char_count, chars[ char_idx ] });
129 assert( counts_sum <= seq_count );
133 assert( counts_map.size() == 4 );
140 auto const gap_count = seq_count - counts_sum;
141 counts_sum += gap_count;
142 counts_map.push_back({ gap_count, gap_char });
145 assert( counts_sum == seq_count );
152 result += char_selector( counts_map, counts_sum );
168 result.reserve( counts.
length() );
173 auto const num_chars = counts.
characters().size();
175 for(
size_t site_idx = 0; site_idx < counts.
length(); ++site_idx ) {
181 for(
size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
182 auto const char_count = counts.
count_at( char_idx, site_idx );
183 counts_sum += char_count;
187 if( char_count > max_val ) {
189 max_val = char_count;
195 assert( max_val <= counts_sum );
196 assert( counts_sum <= seq_count );
201 auto const gap_count = seq_count - counts_sum;
202 if(( max_val > 0 ) && (( ! allow_gaps ) || ( max_val > gap_count ))) {
203 result += chars[ max_pos ];
214 std::string
const& characters,
219 if( sequences.
size() == 0 ) {
220 throw std::runtime_error(
"Cannot calculate consensus sequence of empty SequenceSet." );
223 throw std::runtime_error(
224 "Cannot calculate consensus sequence for SequenceSet that is not an alignment. "
225 "That is, all Sequences need to have the same length."
230 auto counts =
SiteCounts( characters, sequences[0].size() );
231 counts.add_sequences( sequences );
255 double similarity_factor,
259 if( similarity_factor < 0.0 || similarity_factor > 1.0 ) {
260 throw std::invalid_argument(
261 "Value of similarity_factor has to be in range [ 0.0, 1.0 ]."
266 auto const gap_char =
'-';
270 std::vector<std::pair<size_t, char>>
const& counts_map,
271 size_t const counts_sum
278 if( counts_sum == 0 ) {
279 assert( ! allow_gaps );
284 if( counts_map[0].second == gap_char ) {
289 auto ambiguity_codes = std::string( 1, counts_map[0].second );
292 auto const deviation_threshold = similarity_factor *
static_cast<double>( counts_map[0].first );
296 for(
size_t i = 1; i < counts_map.size(); ++i ) {
297 auto const cur_count =
static_cast<double>( counts_map[i].first );
304 if( cur_count < deviation_threshold || cur_count == 0.0 ) {
310 if( counts_map[i].second == gap_char ) {
313 ambiguity_codes += counts_map[i].second;
326 double similarity_factor,
330 if( sequences.
size() == 0 ) {
331 throw std::runtime_error(
"Cannot calculate consensus sequence of empty SequenceSet." );
334 throw std::runtime_error(
335 "Cannot calculate consensus sequence for SequenceSet that is not an alignment. "
336 "That is, all Sequences need to have the same length."
342 counts.add_sequences( sequences );
354 double frequency_threshold,
359 if( frequency_threshold < 0.0 || frequency_threshold > 1.0 ) {
360 throw std::invalid_argument(
361 "Value of frequency_threshold has to be in range [ 0.0, 1.0 ]."
366 auto const gap_char =
'-';
367 auto const mask_char =
'X';
371 std::vector<std::pair<size_t, char>>
const& counts_map,
372 size_t const counts_sum
379 if( counts_sum == 0 ) {
380 assert( ! allow_gaps );
385 std::string ambiguity_codes;
389 size_t accumulated_sum = 0;
390 double fraction = 0.0;
391 for(
size_t i = 0; i < counts_map.size(); ++i ) {
396 if( counts_map[i].first == 0 ) {
401 if( counts_map[i].second == gap_char ) {
406 accumulated_sum += counts_map[i].first;
407 ambiguity_codes += counts_map[i].second;
411 fraction =
static_cast<double>( accumulated_sum ) /
static_cast<double>( counts_sum );
412 if( fraction >= frequency_threshold ) {
419 assert( ambiguity_codes.size() > 0 );
422 if( ambiguity_codes.size() > 1 && ! use_ambiguities ) {
434 double frequency_threshold,
439 if( sequences.
size() == 0 ) {
440 throw std::runtime_error(
"Cannot calculate consensus sequence of empty SequenceSet." );
443 throw std::runtime_error(
444 "Cannot calculate consensus sequence for SequenceSet that is not an alignment. "
445 "That is, all Sequences need to have the same length."
451 counts.add_sequences( sequences );
472 std::vector<std::pair<size_t, char>>
const& counts_map,
473 size_t const counts_sum
480 if( counts_sum == 0 ) {
481 assert( ! allow_gaps );
486 std::string ambiguity_codes;
489 if(( 2 * counts_map[0].first > counts_sum ) && ( counts_map[0].first > 2 * counts_map[1].first )) {
490 ambiguity_codes = std::string( 1, counts_map[0].second );
493 }
else if( counts_map[0].first + counts_map[1].first > 3.0 *
static_cast<double>( counts_sum ) / 4.0 ) {
495 = std::string( 1, counts_map[0].second )
496 + std::string( 1, counts_map[1].second )
500 }
else if( counts_map[3].first == 0 ) {
502 = std::string( 1, counts_map[0].second )
503 + std::string( 1, counts_map[1].second )
504 + std::string( 1, counts_map[2].second )
509 ambiguity_codes =
"ACGT";
516 std::sort( ambiguity_codes.begin(), ambiguity_codes.end() );
517 if( ambiguity_codes.size() > 0 && ambiguity_codes[0] ==
'-' ) {
533 if( sequences.
size() == 0 ) {
534 throw std::runtime_error(
"Cannot calculate consensus sequence of empty SequenceSet." );
537 throw std::runtime_error(
538 "Cannot calculate consensus sequence for SequenceSet that is not an alignment. "
539 "That is, all Sequences need to have the same length."
545 counts.add_sequences( sequences );