A library for working with phylogenetic and population genetic data.
v0.32.0
signatures.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
40 
41 #include <algorithm>
42 #include <cassert>
43 #include <limits>
44 #include <numeric>
45 #include <ostream>
46 #include <sstream>
47 #include <stdexcept>
48 #include <unordered_map>
49 
50 namespace genesis {
51 namespace sequence {
52 
53 // =================================================================================================
54 // Signature Counts and Frequencies
55 // =================================================================================================
56 
57 std::vector<size_t> signature_counts(
58  Sequence const& sequence,
59  SignatureSpecifications const& settings
60 ) {
61  // Get alphabet.
62  auto const& w = settings.alphabet();
63  auto const ws = w.size();
64 
65  // Get the number of entries in the kmer list.
66  size_t const p = settings.kmer_list_size();
67 
68  // Result vector. Count the occurance of each possible kmer.
69  auto result = std::vector<size_t>( p, 0 );
70 
71  // If the sequence is not long enough and does not contain even one kmer, we are done already.
72  if( sequence.size() < settings.k() ) {
73  return result;
74  }
75 
76  // Store the index of the count vector for the current kmer,
77  // and the number of valid processed chars of the sequence.
78  size_t index = 0;
79  size_t valids = 0;
80 
81  // Process the sequence.
82  for( size_t pos = 0; pos < sequence.size(); ++pos ) {
83  auto const cur = settings.char_index( sequence[pos] );
84 
85  // Check if the char is valid in the alphabet
86  if( cur == settings.InvalidCharIndex ) {
87  switch( settings.unknown_char_behavior() ) {
89  continue;
91  throw std::runtime_error(
92  "Unknown Sequence char for kmer counting: '" +
93  std::string( 1, sequence[pos] ) + "'"
94  );
95  default:
96  assert( false );
97  }
98  }
99 
100  // Build up the index.
101  index *= ws;
102  index %= p;
103  index += cur;
104  ++valids;
105 
106  // Only if we already have seen enough valid chars for one k-mer length (or more),
107  // store the kmer.
108  if( valids >= settings.k() ) {
109  assert( index < result.size() );
110  ++result[ index ];
111  }
112  }
113 
114  return result;
115 }
116 
117 std::vector<double> signature_frequencies(
118  Sequence const& sequence,
119  SignatureSpecifications const& settings
120 ) {
121  // Get kmer counts. We need to do a full accumulation instead of using the sequence length,
122  // because unknown chars might have been skipped while counting.
123  auto const kmers = signature_counts( sequence, settings );
124  auto const sum = static_cast<double>( std::accumulate( kmers.begin(), kmers.end(), 0 ));
125 
126  // Caluclate the frequencies.
127  auto freqs = std::vector<double>( kmers.size() );
128  for( size_t i = 0; i < kmers.size(); ++i ) {
129  freqs[i] = static_cast<double>( kmers[i] ) / sum;
130  }
131  return freqs;
132 }
133 
134 // =================================================================================================
135 // Signature Symmetrized
136 // =================================================================================================
137 
141 template<class T>
143  std::vector<T> const& kmers,
144  SignatureSpecifications const& settings
145 ) {
146  auto const& map = settings.kmer_combined_reverse_complement_map();
147  assert( kmers.size() == map.size() );
148 
149  auto const size = settings.kmer_reverse_complement_list_size();
150  auto result = std::vector<T>( size, {} );
151 
152  for( size_t i = 0; i < kmers.size(); ++i ) {
153  assert( map[i] < result.size() );
154  result[ map[i] ] += kmers[i];
155  }
156  return result;
157 }
158 
159 std::vector<size_t> signature_symmetrized_counts(
160  Sequence const& sequence,
161  SignatureSpecifications const& settings
162 ) {
164  signature_counts( sequence, settings ),
165  settings
166  );
167 }
168 
170  Sequence const& sequence,
171  SignatureSpecifications const& settings
172 ) {
174  signature_frequencies( sequence, settings ),
175  settings
176  );
177 }
178 
179 // =================================================================================================
180 // Signature Ranks
181 // =================================================================================================
182 
183 std::vector<size_t> signature_ranks(
184  Sequence const& sequence,
185  SignatureSpecifications const& settings
186 ) {
187  // We use frequencies, because the ranking expects a vec of double,
188  // and not a vec of size_t as returned by the counts signature...
189  return utils::ranking_standard( signature_frequencies( sequence, settings ) );
190 }
191 
192 std::vector<size_t> signature_symmetrized_ranks(
193  Sequence const& sequence,
194  SignatureSpecifications const& settings
195 ) {
196  // We use frequencies, because the ranking expects a vec of double,
197  // and not a vec of size_t as returned by the counts signature...
198  return utils::ranking_standard( signature_symmetrized_frequencies( sequence, settings ) );
199 }
200 
201 // =================================================================================================
202 // Signature Min Max Reverse Complements
203 // =================================================================================================
204 
209 template<class Combinator>
211  Sequence const& sequence,
212  SignatureSpecifications const& settings,
213  Combinator combinator
214 ) {
215  // Get indices first, so that the function fails early if we do not have nucleic acids.
216  // Also, get size of the resulting list, and init the list.
217  auto const& indices = settings.kmer_reverse_complement_indices();
218  // auto const size = settings.kmer_reverse_complement_list_size( false );
219  // auto result = std::vector<double>( size, init );
220  auto result = std::vector<double>();
221 
222  // Calculate freqs.
223  auto const& freqs = signature_frequencies( sequence, settings );
224  assert( freqs.size() == indices.size() );
225  for( size_t i = 0; i < freqs.size(); ++i ) {
226 
227  // Skip palindromes.
228  if( indices[i] == i ) {
229  continue;
230  }
231 
232  // Skip the second kmer. We do the comparison for the first one already.
233  if( indices[i] < i ) {
234  continue;
235  }
236 
237  // Find the min or max of the two complementary kmers and store it.
238  auto const combined = combinator( freqs[i], freqs[ indices[i] ] );
239  result.push_back( combined );
240  }
241  return result;
242 }
243 
245  Sequence const& sequence,
246  SignatureSpecifications const& settings
247 ) {
249  sequence,
250  settings,
251  []( double ff, double fc ){
252  return std::min( ff, fc );
253  }
254  );
255 }
256 
258  Sequence const& sequence,
259  SignatureSpecifications const& settings
260 ) {
262  sequence,
263  settings,
264  []( double ff, double fc ){
265  return std::max( ff, fc );
266  }
267  );
268 }
269 
271  Sequence const& sequence,
272  SignatureSpecifications const& settings
273 ){
274  // Get indices first, so that the function fails early if we do not have nucleic acids.
275  auto const& indices = settings.kmer_reverse_complement_indices();
276  auto result = std::vector<double>();
277 
278  // Calculate freqs.
279  auto const& freqs = signature_frequencies( sequence, settings );
280  assert( freqs.size() == indices.size() );
281  for( size_t i = 0; i < freqs.size(); ++i ) {
282 
283  // Only use palindromes, i.e., the ones where the reverse complement is the identity.
284  if( indices[i] != i ) {
285  continue;
286  }
287  result.push_back( freqs[i] );
288  }
289  return result;
290 }
291 
292 // =================================================================================================
293 // Signature Misc
294 // =================================================================================================
295 
296 std::vector<double> signature_frequency_ratios_1(
297  Sequence const& sequence,
298  SignatureSpecifications const& settings
299 ) {
301  sequence,
302  settings,
303  []( double ff, double fc ){
304  assert( ff >= 0.0 && fc >= 0.0 );
305  if( ff == 0.0 && fc == 0.0 ) {
306  return 1.0;
307  } else if( ff == 0.0 || fc == 0.0 ) {
308  return 0.0;
309  } else {
310  return std::min( ff / fc, fc / ff );
311  }
312  }
313  );
314 }
315 
316 std::vector<double> signature_frequency_ratios_2(
317  Sequence const& sequence,
318  SignatureSpecifications const& settings
319 ) {
321  sequence,
322  settings,
323  []( double ff, double fc ){
324  assert( ff >= 0.0 && fc >= 0.0 );
325  if( ff == 0.0 && fc == 0.0 ) {
326  return 0.5;
327  } else {
328  return std::min( ff, fc ) / ( ff + fc );
329  }
330  }
331  );
332 }
333 
334 std::vector<double> signature_jensen_shannon(
335  Sequence const& sequence,
336  SignatureSpecifications const& settings
337 ) {
339  sequence,
340  settings,
341  []( double ff, double fc ){
342  assert( ff >= 0.0 && fc >= 0.0 );
343  auto const s1 = ff * log( ( 2 * ff ) / ( ff + fc ) );
344  auto const s2 = fc * log( ( 2 * fc ) / ( ff + fc ) );
345  return s1 + s2;
346  }
347  );
348 }
349 
350 // =================================================================================================
351 // Kmer Strings
352 // =================================================================================================
353 
358  Sequence const& sequence,
359  SignatureSpecifications const& settings,
360  size_t start,
361  std::ostream& out
362 ) {
363  for( size_t i = start; i < start + settings.k(); ++i ) {
364 
365  // Check if the char is valid in the alphabet
366  if( settings.char_index( sequence[ i ] ) == settings.InvalidCharIndex ) {
367  switch( settings.unknown_char_behavior() ) {
369  continue;
371  throw std::runtime_error(
372  "Unknown Sequence char for kmer string: '" +
373  std::string( 1, sequence[ i ] ) + "'"
374  );
375  default:
376  assert( false );
377  }
378  }
379 
380  out << sequence[ i ];
381  }
382 }
383 
388  Sequence const& sequence,
389  SignatureSpecifications const& settings,
390  std::ostream& out
391 ) {
392  auto const k = settings.k();
393 
394  // Not even one k-mer: Do nothing.
395  // We need this check, otherwise the subtraction of sizes later on gives an overflow.
396  if( sequence.size() < k ) {
397  return;
398  }
399 
400  for( size_t i = 0; i < sequence.size() - k + 1; ++i ) {
401  if( i > 0 ) {
402  out << " ";
403  }
404  kmer_string_single_kmer( sequence, settings, i, out );
405  }
406 }
407 
412  Sequence const& sequence,
413  SignatureSpecifications const& settings,
414  std::ostream& out,
415  size_t offset
416 ) {
417  auto const k = settings.k();
418  for( size_t i = offset; i + k - 1 < sequence.size(); i += k ) {
419  if( i > offset ) {
420  out << " ";
421  }
422  kmer_string_single_kmer( sequence, settings, i, out );
423  }
424 }
425 
427  Sequence const& sequence,
428  SignatureSpecifications const& settings
429 ) {
430  std::ostringstream out;
431  kmer_string_overlapping_line( sequence, settings, out );
432  return out.str();
433 }
434 
436  Sequence const& sequence,
437  SignatureSpecifications const& settings,
438  std::ostream& out
439 ) {
440  if( sequence.size() < settings.k() ) {
441  return;
442  }
443 
444  kmer_string_overlapping_line( sequence, settings, out );
445  out << "\n";
446 }
447 
448 std::vector<std::string> kmer_strings_non_overlapping(
449  Sequence const& sequence,
450  SignatureSpecifications const& settings
451 ) {
452  std::vector<std::string> result;
453  for( size_t offset = 0; offset < settings.k(); ++offset ) {
454 
455  // Stop if we do not have enough sequence length left for this offset.
456  // This only occurs if the sequence is short.
457  if( offset + settings.k() > sequence.size() ) {
458  break;
459  }
460 
461  std::ostringstream out;
462  kmer_strings_non_overlapping_line( sequence, settings, out, offset );
463  result.push_back( out.str() );
464  }
465  return result;
466 }
467 
469  Sequence const& sequence,
470  SignatureSpecifications const& settings,
471  std::ostream& out
472 ) {
473  for( size_t offset = 0; offset < settings.k(); ++offset ) {
474 
475  // Stop if we do not have enough sequence length left for this offset.
476  // This only occurs if the sequence is short.
477  if( offset + settings.k() > sequence.size() ) {
478  break;
479  }
480 
481  kmer_strings_non_overlapping_line( sequence, settings, out, offset );
482  out << "\n";
483  }
484 }
485 
486 } // namespace sequence
487 } // namespace genesis
genesis::sequence::SignatureSpecifications::kmer_list_size
size_t kmer_list_size() const
Definition: signature_specifications.cpp:117
genesis::sequence::kmer_string_overlapping
std::string kmer_string_overlapping(Sequence const &sequence, SignatureSpecifications const &settings)
Return the sequence spitted into overlapping k-mers.
Definition: signatures.cpp:426
genesis::sequence::signature_reverse_identity_frequencies
std::vector< double > signature_reverse_identity_frequencies(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the signature of a sequence that uses only the frequencies of k-mers whose reverse compleme...
Definition: signatures.cpp:270
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
stats.hpp
signatures.hpp
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
genesis::sequence::signature_symmetrized_ranks
std::vector< size_t > signature_symmetrized_ranks(Sequence const &sequence, SignatureSpecifications const &settings)
Calcuate the symmetrized rank signature of a sequence according to the settings.
Definition: signatures.cpp:192
genesis::sequence::SignatureSpecifications::InvalidCharIndex
static const size_t InvalidCharIndex
Value that is used to indicate an invalid (non-alphabet) char when using index_of().
Definition: signature_specifications.hpp:92
genesis::sequence::signature_symmetrized_counts
std::vector< size_t > signature_symmetrized_counts(Sequence const &sequence, SignatureSpecifications const &settings)
Calcuate the symmetrized counts of the sequence according to the settings.
Definition: signatures.cpp:159
genesis::sequence::signature_complementarity_frequencies_helper
std::vector< double > signature_complementarity_frequencies_helper(Sequence const &sequence, SignatureSpecifications const &settings, Combinator combinator)
Local helper function that returns a vector where the frequencies of the non-palindromic kmers are co...
Definition: signatures.cpp:210
genesis::sequence::signature_ranks
std::vector< size_t > signature_ranks(Sequence const &sequence, SignatureSpecifications const &settings)
Calcuate the rank signature of a sequence according to the settings.
Definition: signatures.cpp:183
genesis::sequence::signature_maximal_complementarity_frequencies
std::vector< double > signature_maximal_complementarity_frequencies(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the signature of a sequence that uses the maximum frequency of reverse complement k-mers.
Definition: signatures.cpp:257
sequence_set.hpp
genesis::utils::offset
void offset(Histogram &h, double value)
Definition: operations.cpp:47
signature_specifications.hpp
genesis::sequence::kmer_string_overlapping_line
static void kmer_string_overlapping_line(Sequence const &sequence, SignatureSpecifications const &settings, std::ostream &out)
Local helper function that writes an overlapping kmer string to a stream.
Definition: signatures.cpp:387
genesis::sequence::signature_frequency_ratios_1
std::vector< double > signature_frequency_ratios_1(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the ratio 1 signature of a sequence.
Definition: signatures.cpp:296
genesis::sequence::SignatureSpecifications::unknown_char_behavior
UnknownCharBehavior unknown_char_behavior() const
Definition: signature_specifications.hpp:140
genesis::sequence::SignatureSpecifications::kmer_reverse_complement_list_size
size_t kmer_reverse_complement_list_size(bool with_palindromes=true) const
Definition: signature_specifications.cpp:263
char_lookup.hpp
genesis::sequence::SignatureSpecifications::k
size_t k() const
Definition: signature_specifications.hpp:135
genesis::sequence::signature_frequency_ratios_2
std::vector< double > signature_frequency_ratios_2(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the ratio 2 signature of a sequence.
Definition: signatures.cpp:316
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::SignatureSpecifications::UnknownCharBehavior::kThrow
@ kThrow
Throw an exception.
statistics.hpp
genesis::sequence::signature_counts
std::vector< size_t > signature_counts(Sequence const &sequence, SignatureSpecifications const &settings)
Count the occurences of k-mers in the sequence according to the settings.
Definition: signatures.cpp:57
genesis::sequence::SignatureSpecifications::alphabet
std::string const & alphabet() const
Definition: signature_specifications.hpp:130
genesis::sequence::signature_symmetrized_helper
std::vector< T > signature_symmetrized_helper(std::vector< T > const &kmers, SignatureSpecifications const &settings)
Local helper function that adds up the values for reverse complement k-mers.
Definition: signatures.cpp:142
genesis::sequence::Sequence::size
size_t size() const
Alias for length().
Definition: sequence/sequence.hpp:175
genesis::sequence::signature_jensen_shannon
std::vector< double > signature_jensen_shannon(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the Jensen-Shannon (JS) signature of a sequence.
Definition: signatures.cpp:334
genesis::sequence::kmer_string_single_kmer
static void kmer_string_single_kmer(Sequence const &sequence, SignatureSpecifications const &settings, size_t start, std::ostream &out)
Local helper function that writes one kmer string to a stream.
Definition: signatures.cpp:357
genesis::sequence::kmer_strings_non_overlapping
std::vector< std::string > kmer_strings_non_overlapping(Sequence const &sequence, SignatureSpecifications const &settings)
Return the sequence spitted into a set of non-overlapping k-mers.
Definition: signatures.cpp:448
genesis::sequence::signature_minimal_complementarity_frequencies
std::vector< double > signature_minimal_complementarity_frequencies(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the signature of a sequence that uses the minimum frequency of reverse complement k-mers.
Definition: signatures.cpp:244
genesis::sequence::kmer_strings_non_overlapping_line
static void kmer_strings_non_overlapping_line(Sequence const &sequence, SignatureSpecifications const &settings, std::ostream &out, size_t offset)
Local helper function that does one line of a non overlapping kmer string.
Definition: signatures.cpp:411
genesis::sequence::SignatureSpecifications::kmer_reverse_complement_indices
std::vector< size_t > const & kmer_reverse_complement_indices() const
Get the indices for each kmer in kmer_list() to its reverse complement in the list.
Definition: signature_specifications.cpp:174
genesis::utils::ranking_standard
std::vector< size_t > ranking_standard(RandomAccessIterator first, RandomAccessIterator last)
Return the ranking of the values in the given range, using Standard competition ranking ("1224" ranki...
Definition: ranking.hpp:61
genesis::sequence::SignatureSpecifications::char_index
size_t char_index(char c) const
Return the index of a char within the alphabet().
Definition: signature_specifications.hpp:163
genesis::sequence::SignatureSpecifications
Specifications for calculating signatures (like k-mer counts) from Sequences.
Definition: signature_specifications.hpp:64
genesis::sequence::SignatureSpecifications::UnknownCharBehavior::kSkip
@ kSkip
Simply ignore the char by skipping it.
sequence.hpp
codes.hpp
genesis::sequence::signature_frequencies
std::vector< double > signature_frequencies(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the frequencies of occurences of k-mers in the sequence according to the settings.
Definition: signatures.cpp:117
genesis::sequence::signature_symmetrized_frequencies
std::vector< double > signature_symmetrized_frequencies(Sequence const &sequence, SignatureSpecifications const &settings)
Calcuate the symmetrized counts of the sequence according to the settings.
Definition: signatures.cpp:169
genesis::sequence::SignatureSpecifications::kmer_combined_reverse_complement_map
std::vector< size_t > const & kmer_combined_reverse_complement_map() const
Get a map from indices of kmer_list() and signature_counts() vectors to a smaller list which combines...
Definition: signature_specifications.cpp:122