A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
signatures.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 
39 #include <algorithm>
40 #include <unordered_map>
41 
42 namespace genesis {
43 namespace sequence {
44 
45 // =================================================================================================
46 // K-mers
47 // =================================================================================================
48 
49 std::vector<size_t> kmer_counts( Sequence const& seq, size_t k )
50 {
51  // Make a very generous check whether we can actually calculate the wanted power.
52  // We are going to calculate the power 4^k, which is 2^(2k), so test for this limit.
53  // We will almost surely run out of memory for even way smaller k, but at least this is the
54  // upper limit. Also, sutract one last bit, because we need it for programming convenience.
55  // Coule be done in a different way, but the size is big enough anyway.
56  if( k > sizeof(size_t) * 8 / 2 - 1 ) {
57  throw std::invalid_argument( "Cannot store kmers for such large k." );
58  }
59  if( k == 0 ) {
60  throw std::invalid_argument( "Invalid k==0 for kmer list." );
61  }
62 
63  // Result vector. Count the occurance of each possible kmer.
64  auto result = std::vector<size_t>( utils::int_pow( 4, k ), 0 );
65 
66  if( seq.size() < k ) {
67  return result;
68  }
69 
70  auto char_to_num = []( char c ){
71  switch( c ) {
72  case 'a':
73  case 'A':
74  return 0;
75 
76  case 'c':
77  case 'C':
78  return 1;
79 
80  case 'g':
81  case 'G':
82  return 2;
83 
84  case 't':
85  case 'T':
86  return 3;
87 
88  default:
89  throw std::runtime_error( "Sequence contains other chars than 'ACGT'." );
90  }
91  };
92 
93  // Current position in the sequence, and the idx of the count vector for the current kmer.
94  size_t pos = 0;
95  size_t idx = 0;
96 
97  size_t const mask = ( 1 << ( k * 2 )) - 1;
98  // LOG_DBG2 << "mask " << mask;
99 
100  // Init with the first kmer.
101  for( ; pos < k; ++pos ) {
102  idx <<= 2;
103  idx |= char_to_num( seq[pos] );
104 
105  // LOG_DBG2 << seq[pos] << " ("<< char_to_num( seq[pos] ) << ")" << " --> " << idx;
106  }
107  ++result[ idx ];
108 
109  // LOG_DBG2 << "---";
110 
111  // Process the rest of the sequence.
112  for( ; pos < seq.size(); ++pos ) {
113  // LOG_DBG2 << "1 " << idx;
114  idx <<= 2;
115  // LOG_DBG2 << "2 " << idx;
116  idx &= mask;
117  // LOG_DBG2 << "3 " << idx;
118  idx |= char_to_num( seq[pos] );
119  // LOG_DBG2 << "4 " << idx;
120 
121  // LOG_DBG2 << seq[pos] << " ("<< char_to_num( seq[pos] ) << ")" << " --> " << idx;
122  ++result[ idx ];
123  }
124 
125  return result;
126 }
127 
128 std::vector<size_t> kmer_counts( Sequence const& seq, size_t k, std::string const& alphabet )
129 {
130  // Normalize alphabet.
131  auto const w = normalize_codes( alphabet );
132 
133  // Size check.
134  if( w.size() == 0 ) {
135  throw std::invalid_argument( "Invalid alphabet for kmer list." );
136  }
137  if( k == 0 ) {
138  throw std::invalid_argument( "Invalid k==0 for kmer list." );
139  }
140  if( ! utils::is_valid_int_pow( w.size(), k ) ) {
141  throw std::invalid_argument( "Cannot store kmers for such large k." );
142  }
143 
144  // Get the number of entries in the kmer list.
145  size_t const p = utils::int_pow( w.size(), k );
146 
147  // Result vector. Count the occurance of each possible kmer.
148  auto result = std::vector<size_t>( p, 0 );
149 
150  // If the sequence is not long enough and does not contain even one kmer, we are done already.
151  if( seq.size() < k ) {
152  return result;
153  }
154 
155  // Build lookup from sequence chars to index. Use w.size() as invalid char indicator.
156  auto lookup = utils::CharLookup<size_t>( w.size() );
157  for( size_t i = 0; i < w.size(); ++i ) {
158  lookup.set_char_upper_lower( w[i], i );
159  }
160 
161  // Store the index of the count vector for the current kmer,
162  // and the number of valid processed chars of the sequence.
163  size_t index = 0;
164  size_t valids = 0;
165 
166  // Process the sequence.
167  for( size_t pos = 0; pos < seq.size(); ++pos ) {
168  auto const cur = lookup[ seq[pos] ];
169 
170  // Check if the char is valid in the alphabet, and build up the index.
171  if( cur < w.size() ) {
172  index *= w.size();
173  index %= p;
174  index += cur;
175  ++valids;
176 
177  // Only if we already have seen enough valid chars for one k-mer length (or more),
178  // store the kmer.
179  if( valids >= k ) {
180  assert( index < p );
181  ++result[ index ];
182  }
183  }
184  }
185 
186  return result;
187 }
188 
189 std::vector<std::string> kmer_list( size_t k, std::string const& alphabet )
190 {
191  // Normalize alphabet.
192  auto const w = normalize_codes( alphabet );
193 
194  // Size check.
195  if( w.size() == 0 ) {
196  throw std::invalid_argument( "Invalid alphabet for kmer list." );
197  }
198  if( k == 0 ) {
199  throw std::invalid_argument( "Invalid k==0 for kmer list." );
200  }
201  if( ! utils::is_valid_int_pow( w.size(), k ) ) {
202  throw std::invalid_argument( "Cannot store kmers for such large k." );
203  }
204 
205  // Get the number of entries in the kmer list.
206  size_t const p = utils::int_pow( w.size(), k );
207 
208  // Result vector. List all kmers.
209  auto result = std::vector<std::string>( p );
210  for( size_t i = 0; i < p; ++i ) {
211 
212  // Start with a dummy kmer of the correct size and the current index.
213  auto kmer = std::string( k, '#' );
214  assert( kmer.size() == k );
215  size_t c = i;
216 
217  // Fill the kmer from right to left, using conversion of c to base w.size().
218  for( size_t j = 0; j < k; ++j ) {
219  assert(( 1 + j <= k ) && ( k - 1 - j < kmer.size() ));
220 
221  kmer[ k - 1 - j ] = w[ c % w.size() ];
222  c /= w.size();
223  }
224 
225  // In the last loop iteration, we processed all bits of c, so there should be nothing left.
226  assert( c == 0 );
227 
228  result[ i ] = kmer;
229  }
230 
231  return result;
232 }
233 
234 // =================================================================================================
235 // Signatures
236 // =================================================================================================
237 
238 std::string reverse_complement( std::string const& sequence )
239 {
240  auto result = sequence;
241 
242  auto rev_comp = []( char c ){
243  switch( c ) {
244  case 'a':
245  case 'A':
246  return 'T';
247 
248  case 'c':
249  case 'C':
250  return 'G';
251 
252  case 'g':
253  case 'G':
254  return 'C';
255 
256  case 't':
257  case 'T':
258  return 'A';
259 
260  default:
261  throw std::runtime_error( "Sequence contains other chars than 'ACGT'." );
262  }
263  };
264 
265  // Stupid and simple.
266  for( size_t i = 0; i < sequence.size(); ++i ) {
267  result[ sequence.size() - i - 1 ] = rev_comp( sequence[i] );
268  }
269  return result;
270 }
271 
272 std::vector<size_t> kmer_reverse_complement_indices( size_t k )
273 {
274  // Calculate a vector that maps from a kmer index according to kmer_list
275  // to a position in [ 0 , s ), so that rev comp indices map to the same position.
276  auto indices = std::vector<size_t>( utils::int_pow( 4, k ), 0 );
277 
278  // Store a list of kmers and their indices. it only stores one direction - that is,
279  // if we see that the rev comp of a kmer is already in this map, we do not add anything.
280  std::unordered_map<std::string, size_t> done;
281 
282  // Get all kmers as strings.
283  auto const list = kmer_list( k, "ACGT" );
284  assert( indices.size() == list.size() );
285 
286  // This uses string maps and is terribly slow.
287  // But I am really lazy today and the function will not be called often.
288  for( size_t i = 0; i < list.size(); ++i ) {
289  auto const& seq = list[i];
290 
291  // We always encounter the alphabetically first entry fist.
292  // So, we cannot have added its complement before.
293  assert( done.count( seq ) == 0 );
294 
295  // If we have seen the rev comp of the current kmer before, reuse its index.
296  // If not, make a new index, which is one higher than the currently highest index
297  // (by using done.size() to get the number of current entries).
298  auto const rev = reverse_complement( seq );
299  if( done.count( rev ) == 0 ) {
300  auto const ni = done.size();
301  indices[ i ] = ni;
302  done[ seq ] = ni;
303  } else {
304  // We have seen the rev comp before, so we do not update the done list.
305  indices[ i ] = done[ rev ];
306  }
307  }
308 
309  return indices;
310 }
311 
313 {
314  // Calcualtions according to: https://stackoverflow.com/a/40953130
315 
316  // Number of palindromic k-mers. For odd k, there are none, for even k, there are 4^(k/2) = 2^k
317  auto const p = ( k % 2 == 1 ? 0 : utils::int_pow( 2, k ));
318 
319  // Number of entries needed to store rev comp kmers.
320  return p + ( utils::int_pow( 4, k ) - p ) / 2;
321 }
322 
323 std::vector<double> signature_frequencies( Sequence const& seq, size_t k )
324 {
325  // Get kmer counts.
326  auto const kmers = kmer_counts( seq, k );
327  auto const sum = static_cast<double>( std::accumulate( kmers.begin(), kmers.end(), 0 ));
328 
329  // Caluclate their frequencies.
330  auto freqs = std::vector<double>( kmers.size() );
331  for( size_t i = 0; i < kmers.size(); ++i ) {
332  freqs[i] = static_cast<double>( kmers[i] ) / sum;
333  }
334  return freqs;
335 }
336 
337 } // namespace sequence
338 } // namespace genesis
size_t size() const
Alias for length().
Definition: sequence.cpp:112
std::string reverse_complement(std::string const &sequence)
Get the reverse complement of a sequences of ACGT characters.
Definition: signatures.cpp:238
bool is_valid_int_pow(size_t base, size_t exp)
Return whether the given power can be stored within a size_t.
Definition: common.hpp:331
size_t int_pow(size_t base, size_t exp)
Calculate the power base^exp for positive integer values.
Definition: common.hpp:311
std::vector< double > signature_frequencies(Sequence const &seq, size_t k)
Definition: signatures.cpp:323
double sum(const Histogram &h)
std::string normalize_codes(std::string const &alphabet)
Normalize a set of Sequence codes, i.e., make them upper case, sort them, and remove duplicates...
Definition: codes.cpp:345
std::vector< size_t > kmer_reverse_complement_indices(size_t k)
Get a map from indices of kmer_list() and kmer_counts() vectors to a smaller list of size kmer_revers...
Definition: signatures.cpp:272
void set_char_upper_lower(char c, T value)
Set the lookup status for both the upper and lower case of a given char.
Simple lookup table providing a value lookup for each ASCII char (0-127).
Definition: char_lookup.hpp:54
size_t kmer_reverse_complement_size(size_t k)
Get the size needed to store reverse complement kmers.
Definition: signatures.cpp:312
std::vector< size_t > kmer_counts(Sequence const &seq, size_t k)
Count the occurences of k-mers of size k, for nucleic acids "ACGT".
Definition: signatures.cpp:49
std::vector< std::string > kmer_list(size_t k, std::string const &alphabet)
Return the list of all possible k-mers for a given k and alphabet.
Definition: signatures.cpp:189