A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
consensus.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 
39 #include <algorithm>
40 #include <cassert>
41 #include <cmath>
42 #include <stdexcept>
43 #include <vector>
44 
45 namespace genesis {
46 namespace sequence {
47 
48 // =================================================================================================
49 // Helper Template
50 // =================================================================================================
51 
55 using CountPair = std::pair< size_t, char >;
56 
63 bool CountPairComparator ( CountPair const& lhs, CountPair const& rhs )
64 {
65  return ( lhs.first > rhs.first ) || ( lhs.first == rhs.first && lhs.second < rhs.second );
66 }
67 
72 template< typename CharSelector >
74  SequenceCounts const& counts,
75  bool const allow_gaps,
76  CharSelector const& char_selector
77 ) {
78  // Check whether the counts object uses the needed character codes for this function.
79  // The characters in the counts object are sorted, so we can directly compare then like this.
80  if( counts.characters() != "ACGT" ) {
81  throw std::runtime_error(
82  "Computation of this consensus sequence function is only meant "
83  "for nucleic acid codes (ACGT)."
84  );
85  }
86 
87  // Prepare result.
88  std::string result;
89  result.reserve( counts.length() );
90 
91  // Prepare some constants for simplicity.
92  auto const chars = counts.characters();
93  auto const seq_count = counts.added_sequences_count();
94  auto const num_chars = counts.characters().size();
95 
96  // Use a hard coded gap char here, as we have fixed character codes anyway.
97  auto const gap_char = '-';
98 
99  // We expect ACGT here.
100  assert( num_chars == 4 );
101 
102  // Special case: empty counts object. In this case, return an all-gap sequence.
103  if( seq_count == 0 ) {
104  // We do not immediately return the string here, in order to facilitate copy elision.
105  result = std::string( counts.length(), gap_char );
106  return result;
107  }
108 
109  // Process all sites of the sequence.
110  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
111 
112  // Total sum of counts. Used for getting the number of gaps.
113  size_t counts_sum = 0;
114 
115  // Map from counts to positions. We use this for sorting by count. It's a vector, because
116  // it will only have 4 or 5 elements, thus this should be faster than complex containers.
117  std::vector< CountPair > counts_map;
118  counts_map.reserve( 4 );
119 
120  // Add all chars with their counts to the map.
121  for( size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
122  auto const char_count = counts.count_at( char_idx, site_idx );
123  counts_sum += char_count;
124  counts_map.push_back({ char_count, chars[ char_idx ] });
125  }
126 
127  // We can never have a sum of counts higher then the number of sequences that were added
128  // to the counts object.
129  assert( counts_sum <= seq_count );
130 
131  // We already checked that the counts object is for nucleic acids.
132  // Thus, we expect four values here.
133  assert( counts_map.size() == 4 );
134 
135  // If we want to use gaps as a normal character, add their count to the map.
136  // We want to compare the gap count with all other frequencies,
137  // so it makes sense to just treat it as a normal character here.
138  // In the char_selector function, some special care might need to be taken however.
139  if( allow_gaps ) {
140  auto const gap_count = seq_count - counts_sum;
141  counts_sum += gap_count;
142  counts_map.push_back({ gap_count, gap_char });
143 
144  // Now that we added gaps, the total sum matches the number of added sequences.
145  assert( counts_sum == seq_count );
146  }
147 
148  // Sort the counts so that the highest one is first.
149  std::sort( counts_map.begin(), counts_map.end(), CountPairComparator );
150 
151  // Get the ambiguity code that represents the selected characters and add it to the seq.
152  result += char_selector( counts_map, counts_sum );
153  }
154 
155  return result;
156 }
157 
158 // =================================================================================================
159 // Majority
160 // =================================================================================================
161 
163  SequenceCounts const& counts,
164  bool allow_gaps,
165  char gap_char
166 ) {
167  std::string result;
168  result.reserve( counts.length() );
169 
170  // Prepare some constants for simplicity.
171  auto const chars = counts.characters();
172  auto const seq_count = counts.added_sequences_count();
173  auto const num_chars = counts.characters().size();
174 
175  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
176 
177  size_t max_pos = 0;
178  SequenceCounts::CountsIntType max_val = 0;
179  SequenceCounts::CountsIntType counts_sum = 0;
180 
181  for( size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
182  auto const char_count = counts.count_at( char_idx, site_idx );
183  counts_sum += char_count;
184 
185  // We use a strict greater here, as this ensures to use the first character in cases
186  // where many have the same count.
187  if( char_count > max_val ) {
188  max_pos = char_idx;
189  max_val = char_count;
190  }
191  }
192 
193  // We can never have a max higher than the total sum of counts, and this again cannot be
194  // higher then the number of sequences that were added to the counts object.
195  assert( max_val <= counts_sum );
196  assert( counts_sum <= seq_count );
197 
198  // We write a code char if it is the majority, that is, > 0 and > all other code counts.
199  // In other cases, write a gap. That is, either no code has a count > 0, or, if we allow
200  // gaps and gaps are more frquent than actual codes.
201  auto const gap_count = seq_count - counts_sum;
202  if(( max_val > 0 ) && (( ! allow_gaps ) || ( max_val > gap_count ))) {
203  result += chars[ max_pos ];
204  } else {
205  result += gap_char;
206  }
207  }
208 
209  return result;
210 }
211 
213  SequenceSet const& sequences,
214  std::string const& characters,
215  bool allow_gaps,
216  char gap_char
217 ) {
218  // Basic checks.
219  if( sequences.size() == 0 ) {
220  throw std::runtime_error( "Cannot calculate consensus sequence of empty SequenceSet." );
221  }
222  if( ! is_alignment( sequences ) ) {
223  throw std::runtime_error(
224  "Cannot calculate consensus sequence for SequenceSet that is not an alignment. "
225  "That is, all Sequences need to have the same length."
226  );
227  }
228 
229  // Build counts object.
230  auto counts = SequenceCounts( characters, sequences[0].size() );
231  counts.add_sequences( sequences );
232 
233  // Return consensus sequence.
234  return consensus_sequence_with_majorities( counts, allow_gaps, gap_char );
235 }
236 
238  SequenceSet const& sequences,
239  bool allow_gaps
240 ) {
242  sequences,
244  allow_gaps,
245  '-'
246  );
247 }
248 
249 // =================================================================================================
250 // Ambiguity
251 // =================================================================================================
252 
254  SequenceCounts const& counts,
255  double similarity_factor,
256  bool allow_gaps
257 ) {
258  // Check the deviation range.
259  if( similarity_factor < 0.0 || similarity_factor > 1.0 ) {
260  throw std::invalid_argument(
261  "Value of similarity_factor has to be in range [ 0.0, 1.0 ]."
262  );
263  }
264 
265  // Use a hard coded gap char here, as we have fixed character codes anyway.
266  auto const gap_char = '-';
267 
268  // Functor that selects the chars according to the consensus specification.
269  auto selector = [&](
270  std::vector<std::pair<size_t, char>> const& counts_map,
271  size_t const counts_sum
272  ){
273  // So that everyone knows what we are dealing with.
274  assert( std::is_sorted( counts_map.begin(), counts_map.end(), CountPairComparator ) );
275 
276  // Special case. All gap site, but allow_gaps == false.
277  // In this case, return a gap instead of an 'N'.
278  if( counts_sum == 0 ) {
279  assert( ! allow_gaps );
280  return gap_char;
281  }
282 
283  // Special case. If gap is the most frequent char, we just return it.
284  if( counts_map[0].second == gap_char ) {
285  return gap_char;
286  }
287 
288  // Prepare a string of characters codes for the ambiguities, init with the most frequent char.
289  auto ambiguity_codes = std::string( 1, counts_map[0].second );
290 
291  // Every character that has at least this count is added to the ambiguity.
292  auto const deviation_threshold = similarity_factor * static_cast<double>( counts_map[0].first );
293 
294  // Compare the less frequent codes to the most frequent one and
295  // decide whether to add them to the ambiguities.
296  for( size_t i = 1; i < counts_map.size(); ++i ) {
297  auto const cur_count = static_cast<double>( counts_map[i].first );
298 
299  // If the count is below the threshold, we are done.
300  // The map is sorted, so no other count will be high enough.
301  // We also avoid zero counts, as this leads to wrong results with an
302  // similarity_factor of 0.0. It would then just add all, ending up with all "N"s,
303  // instead of just all codes that appear in the sequence.
304  if( cur_count < deviation_threshold || cur_count == 0.0 ) {
305  break;
306  }
307 
308  // If it is a gap, we are done - the result is a gap, too.
309  // If not, add it to the ambiguities.
310  if( counts_map[i].second == gap_char ) {
311  return gap_char;
312  } else {
313  ambiguity_codes += counts_map[i].second;
314  }
315  }
316 
317  // Return the ambiguity code that represents the selected characters.
318  return nucleic_acid_ambiguity_code( ambiguity_codes );
319  };
320 
321  return consensus_sequence_template( counts, allow_gaps, selector );
322 }
323 
325  SequenceSet const& sequences,
326  double similarity_factor,
327  bool allow_gaps
328 ) {
329  // Basic checks.
330  if( sequences.size() == 0 ) {
331  throw std::runtime_error( "Cannot calculate consensus sequence of empty SequenceSet." );
332  }
333  if( ! is_alignment( sequences ) ) {
334  throw std::runtime_error(
335  "Cannot calculate consensus sequence for SequenceSet that is not an alignment. "
336  "That is, all Sequences need to have the same length."
337  );
338  }
339 
340  // Build counts object.
341  auto counts = SequenceCounts( nucleic_acid_codes_plain(), sequences[0].size() );
342  counts.add_sequences( sequences );
343 
344  // Return consensus sequence.
345  return consensus_sequence_with_ambiguities( counts, similarity_factor, allow_gaps );
346 }
347 
348 // =================================================================================================
349 // Threshold
350 // =================================================================================================
351 
353  SequenceCounts const& counts,
354  double frequency_threshold,
355  bool allow_gaps,
356  bool use_ambiguities
357 ) {
358  // Check the frequency threshold.
359  if( frequency_threshold < 0.0 || frequency_threshold > 1.0 ) {
360  throw std::invalid_argument(
361  "Value of frequency_threshold has to be in range [ 0.0, 1.0 ]."
362  );
363  }
364 
365  // Use hard coded chars here, as we have fixed character codes anyway.
366  auto const gap_char = '-';
367  auto const mask_char = 'X';
368 
369  // Functor that selects the chars according to the consensus specification.
370  auto selector = [&](
371  std::vector<std::pair<size_t, char>> const& counts_map,
372  size_t const counts_sum
373  ){
374  // So that everyone knows what we are dealing with.
375  assert( std::is_sorted( counts_map.begin(), counts_map.end(), CountPairComparator ) );
376 
377  // Special case. All gap site, but allow_gaps == false.
378  // In this case, return a gap instead of an 'N'.
379  if( counts_sum == 0 ) {
380  assert( ! allow_gaps );
381  return gap_char;
382  }
383 
384  // Prepare a string of characters codes for the ambiguities.
385  std::string ambiguity_codes;
386 
387  // Add up the counts and combine ambiguities until we reach the threshold.
388  // If we still do not reach the threshold with all codes, we end up with an N.
389  size_t accumulated_sum = 0;
390  double fraction = 0.0;
391  for( size_t i = 0; i < counts_map.size(); ++i ) {
392 
393  // If there are no counts, we do not use it (and stop here, because in a sorted
394  // counts order, all following counts will be zero anyway). This way, we only use
395  // those codes for the ambiguity that actually appear at the site.
396  if( counts_map[i].first == 0 ) {
397  break;
398  }
399 
400  // If it is a gap, we are done - the result is a gap, too.
401  if( counts_map[i].second == gap_char ) {
402  return gap_char;
403  }
404 
405  // Use this char!
406  accumulated_sum += counts_map[i].first;
407  ambiguity_codes += counts_map[i].second;
408 
409  // Check if we already reached the threshold.
410  // The division is okay, as we already checked that counts_sum > 0 before.
411  fraction = static_cast<double>( accumulated_sum ) / static_cast<double>( counts_sum );
412  if( fraction >= frequency_threshold ) {
413  break;
414  }
415  }
416 
417  // We checked that counts_sum > 0 in the beginning. Thus, at least counts_map needs to
418  // contain non-zero entries. Thus, we added at least one char to ambiguity_codes.
419  assert( ambiguity_codes.size() > 0 );
420 
421  // Finally, return the needed code.
422  if( ambiguity_codes.size() > 1 && ! use_ambiguities ) {
423  return mask_char;
424  } else {
425  return nucleic_acid_ambiguity_code( ambiguity_codes );
426  }
427  };
428 
429  return consensus_sequence_template( counts, allow_gaps, selector );
430 }
431 
433  SequenceSet const& sequences,
434  double frequency_threshold,
435  bool allow_gaps,
436  bool use_ambiguities
437 ) {
438  // Basic checks.
439  if( sequences.size() == 0 ) {
440  throw std::runtime_error( "Cannot calculate consensus sequence of empty SequenceSet." );
441  }
442  if( ! is_alignment( sequences ) ) {
443  throw std::runtime_error(
444  "Cannot calculate consensus sequence for SequenceSet that is not an alignment. "
445  "That is, all Sequences need to have the same length."
446  );
447  }
448 
449  // Build counts object.
450  auto counts = SequenceCounts( nucleic_acid_codes_plain(), sequences[0].size() );
451  counts.add_sequences( sequences );
452 
453  // Return consensus sequence.
455  counts,
456  frequency_threshold,
457  allow_gaps,
458  use_ambiguities
459  );
460 }
461 
462 // =================================================================================================
463 // Cavener
464 // =================================================================================================
465 
467  SequenceCounts const& counts,
468  bool allow_gaps
469 ) {
470  // Functor that selects the chars according to the consensus specification.
471  auto selector = [&](
472  std::vector<std::pair<size_t, char>> const& counts_map,
473  size_t const counts_sum
474  ){
475  // So that everyone knows what we are dealing with.
476  assert( std::is_sorted( counts_map.begin(), counts_map.end(), CountPairComparator ) );
477 
478  // Special case. All gap site, but allow_gaps == false.
479  // In this case, return a gap instead of an 'N'.
480  if( counts_sum == 0 ) {
481  assert( ! allow_gaps );
482  return '-';
483  }
484 
485  // Prepare
486  std::string ambiguity_codes;
487 
488  // If the hightest freq is > 50% and > 2 * second highest freq, just use it.
489  if(( 2 * counts_map[0].first > counts_sum ) && ( counts_map[0].first > 2 * counts_map[1].first )) {
490  ambiguity_codes = std::string( 1, counts_map[0].second );
491 
492  // If the first two freqs > 75% (and both < 50%, which was checked above), use dual code.
493  } else if( counts_map[0].first + counts_map[1].first > 3.0 * static_cast<double>( counts_sum ) / 4.0 ) {
494  ambiguity_codes
495  = std::string( 1, counts_map[0].second )
496  + std::string( 1, counts_map[1].second )
497  ;
498 
499  // If neither of the above, but one freq is 0, then use three codes.
500  } else if( counts_map[3].first == 0 ) {
501  ambiguity_codes
502  = std::string( 1, counts_map[0].second )
503  + std::string( 1, counts_map[1].second )
504  + std::string( 1, counts_map[2].second )
505  ;
506 
507  // Fall back case: Use 'N'.
508  } else {
509  ambiguity_codes = "ACGT";
510  }
511 
512  // So far, we have treated gap chars as any other. As gaps are not mentioned in the
513  // original method, this is the best we can do.
514  // So now, if we have a gap in there, we return a gap as the end result,
515  // as combining a gap with anything else results in a gap.
516  std::sort( ambiguity_codes.begin(), ambiguity_codes.end() );
517  if( ambiguity_codes.size() > 0 && ambiguity_codes[0] == '-' ) {
518  return '-';
519  }
520 
521  // Return the ambiguity code that represents the selected characters.
522  return nucleic_acid_ambiguity_code( ambiguity_codes );
523  };
524 
525  return consensus_sequence_template( counts, allow_gaps, selector );
526 }
527 
529  SequenceSet const& sequences,
530  bool allow_gaps
531 ) {
532  // Basic checks.
533  if( sequences.size() == 0 ) {
534  throw std::runtime_error( "Cannot calculate consensus sequence of empty SequenceSet." );
535  }
536  if( ! is_alignment( sequences ) ) {
537  throw std::runtime_error(
538  "Cannot calculate consensus sequence for SequenceSet that is not an alignment. "
539  "That is, all Sequences need to have the same length."
540  );
541  }
542 
543  // Build counts object.
544  auto counts = SequenceCounts( nucleic_acid_codes_plain(), sequences[0].size() );
545  counts.add_sequences( sequences );
546 
547  // Return consensus sequence.
548  return consensus_sequence_cavener( counts, allow_gaps );
549 }
550 
551 } // namespace sequence
552 } // namespace genesis
std::string consensus_sequence_template(SequenceCounts const &counts, bool const allow_gaps, CharSelector const &char_selector)
Local helper function template that handles the common code for the nucleic acid consensus sequence f...
Definition: consensus.cpp:73
Store counts of the occurence for certain characters of Sequences.
Definition: counts.hpp:84
CountsIntType added_sequences_count() const
Return the number of processed Sequences, i.e., how many Sequences were added in total.
Definition: counts.cpp:86
std::string consensus_sequence_with_ambiguities(SequenceCounts const &counts, double similarity_factor, bool allow_gaps)
Calculate a consensus sequence by using the most frequent characters at each site, for nucleic acid codes ACGT and their ambiguities.
Definition: consensus.cpp:253
std::string nucleic_acid_codes_plain()
Return all plain nucleic acid codes. Those are "ACGTU".
Definition: codes.cpp:293
std::string consensus_sequence_with_majorities(SequenceCounts const &counts, bool allow_gaps, char gap_char)
Calculate the majority rule consensus sequence by using the most frequent character at each site...
Definition: consensus.cpp:162
uint32_t CountsIntType
Type of uint used for internally counting the freuqencies of Sequence sites.
Definition: counts.hpp:100
size_t length() const
Return the number of sites used for counting.
Definition: counts.cpp:75
std::string consensus_sequence_with_threshold(SequenceCounts const &counts, double frequency_threshold, bool allow_gaps, bool use_ambiguities)
Calculate a consensus sequence where the character frequency needs to be above a given threshold...
Definition: consensus.cpp:352
Store a set of Sequences.
std::string consensus_sequence_cavener(SequenceCounts const &counts, bool allow_gaps)
Calculate a consensus sequence using the method by Cavener for nucleic acid codes ACGT and their ambi...
Definition: consensus.cpp:466
bool is_alignment(SequenceSet const &set)
Return true iff all Sequences in the SequenceSet have the same length.
std::string characters() const
Return the character set that is used for counting.
Definition: counts.cpp:80
std::pair< size_t, char > CountPair
Helper struct to store characters sorted by their count.
Definition: consensus.cpp:55
CountsIntType count_at(size_t character_index, size_t site_index) const
Return the count for a character and a site, given their indices.
Definition: counts.cpp:109
bool CountPairComparator(CountPair const &lhs, CountPair const &rhs)
Local helper function to sort a CountPair.
Definition: consensus.cpp:63
char nucleic_acid_ambiguity_code(std::string codes)
Return the nucleic acid code that represents all given codes.
Definition: codes.cpp:409