A toolkit for working with phylogenetic data.
v0.20.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
signature_specifications.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
36 #include <algorithm>
37 #include <cassert>
38 #include <limits>
39 #include <numeric>
40 #include <stdexcept>
41 #include <unordered_map>
42 
43 namespace genesis {
44 namespace sequence {
45 
46 // ================================================================================================
47 // Constructors and Rule of Five
48 // ================================================================================================
49 
50 size_t const SignatureSpecifications::InvalidCharIndex = std::numeric_limits<size_t>::max();
51 
52 SignatureSpecifications::SignatureSpecifications( std::string const& alphabet, size_t k )
53 {
54  alphabet_ = normalize_code_alphabet( alphabet );
55  k_ = k;
56  is_nucleic_acids_ = ( alphabet_ == "ACGT" );
57 
58  // Size checks.
59  if( alphabet_.size() == 0 ) {
60  throw std::invalid_argument( "Invalid alphabet for kmer list." );
61  }
62  if( ! utils::is_valid_int_pow( alphabet_.size(), k ) ) {
63  throw std::invalid_argument( "Cannot store kmers for such large k." );
64  }
65  if( k == 0 ) {
66  throw std::invalid_argument( "Invalid k==0 for kmer list." );
67  }
68 
69  // Create lookup from char to its index in the alphabet.
71  for( size_t i = 0; i < alphabet_.size(); ++i ) {
72  index_lookup_.set_char_upper_lower( alphabet_[i], i );
73  }
74 }
75 
76 // ================================================================================================
77 // Derived Properties
78 // ================================================================================================
79 
80 std::vector<std::string> const& SignatureSpecifications::kmer_list() const
81 {
82  // Lazy loading.
83  if( kmer_list_.size() > 0 ) {
84  return kmer_list_;
85  }
86 
87  // Get some settings.
88  auto const& w = alphabet();
89  size_t const p = kmer_list_size();
90 
91  // Result vector. List all kmers.
92  kmer_list_ = std::vector<std::string>( p );
93  for( size_t i = 0; i < p; ++i ) {
94 
95  // Start with a dummy kmer of the correct size and the current index.
96  auto kmer = std::string( k_, '#' );
97  assert( kmer.size() == k_ );
98  size_t c = i;
99 
100  // Fill the kmer from right to left, using conversion of c to base w.size().
101  for( size_t j = 0; j < k_; ++j ) {
102  assert(( 1 + j <= k_ ) && ( k_ - 1 - j < kmer.size() ));
103 
104  kmer[ k_ - 1 - j ] = w[ c % w.size() ];
105  c /= w.size();
106  }
107 
108  // In the last loop iteration, we processed all bits of c, so there should be nothing left.
109  assert( c == 0 );
110 
111  kmer_list_[ i ] = kmer;
112  }
113 
114  return kmer_list_;
115 }
116 
118 {
119  return utils::int_pow( alphabet_.size(), k_ );
120 }
121 
123 {
124  if( ! is_nucleic_acids() ) {
125  throw std::runtime_error( "Reverse complement kmers only valid for nucleic acid codes." );
126  }
127 
128  // Lazy loading.
129  if( rev_comp_map_.size() > 0 ) {
130  return rev_comp_map_;
131  }
132 
133  // Calculate a vector that maps from a kmer index according to kmer_list
134  // to a position in [ 0 , s ), so that rev comp indices map to the same position.
135  rev_comp_map_ = std::vector<size_t>( utils::int_pow( 4, k_ ), 0 );
136 
137  // Store a list of kmers and their indices. it only stores one direction - that is,
138  // if we see that the rev comp of a kmer is already in this map, we do not add anything.
139  std::unordered_map<std::string, size_t> done;
140 
141  // Get all kmers as strings.
142  auto const& list = kmer_list();
143  assert( rev_comp_map_.size() == list.size() );
144 
145  // This uses string maps and is terribly slow.
146  // But I am really lazy today and the function will not be called often.
147  for( size_t i = 0; i < list.size(); ++i ) {
148  auto const& seq = list[i];
149 
150  // We always encounter the alphabetically first entry fist.
151  // So, we cannot have added its complement before.
152  assert( done.count( seq ) == 0 );
153 
154  // If we have seen the rev comp of the current kmer before, reuse its index.
155  // If not, make a new index, which is one higher than the currently highest index
156  // (by using done.size() to get the number of current entries).
157  auto const rev = reverse_complement( seq );
158  if( done.count( rev ) == 0 ) {
159  auto const ni = done.size();
160  rev_comp_map_[ i ] = ni;
161  done[ seq ] = ni;
162  } else {
163  // We have seen the rev comp before, so we do not update the done list.
164  rev_comp_map_[ i ] = done[ rev ];
165  }
166  }
167 
168  // In the end, we have as many entries in done as we have rev comp kmers.
169  assert( done.size() == kmer_reverse_complement_list_size() );
170 
171  return rev_comp_map_;
172 }
173 
175 {
176  if( ! is_nucleic_acids() ) {
177  throw std::runtime_error( "Reverse complement kmers only valid for nucleic acid codes." );
178  }
179 
180  // Lazy loading.
181  if( rev_comp_indices_.size() > 0 ) {
182  return rev_comp_indices_;
183  }
184 
185  // Get all kmers as strings.
186  auto const& list = kmer_list();
187 
188  // Init the result list with max as an indicator that this value has yet to be set.
189  rev_comp_indices_ = std::vector<size_t>( list.size(), std::numeric_limits<size_t>::max() );
190 
191  // Fill the list.
192  for( size_t i = 0; i < list.size(); ++i ) {
193 
194  // We always set two values at a time for speeup, so some can be skipped.
195  if( rev_comp_indices_[i] != std::numeric_limits<size_t>::max() ) {
196  continue;
197  }
198 
199  // Get the current kmer and its rev comp.
200  auto const& seq = list[i];
201  auto const rev = reverse_complement( seq );
202 
203  // If they are the same, the index is also the identity map.
204  // Otherwise, find the rev comp in the list.
205  if( seq == rev ) {
206  rev_comp_indices_[i] = i;
207  } else {
208 
209  // Find the index of the rev compt entry.
210  // It must exist, and is located after the current position i,
211  // because the kmer list is sorted.
212  auto const it = std::find( list.begin() + i, list.end(), rev );
213  assert( it != list.end() );
214  size_t const rci = std::distance( list.begin(), it );
215  assert( rci < rev_comp_indices_.size() );
216 
217  // Set both indicies so that they point to each other.
218  rev_comp_indices_[ i ] = rci;
219  rev_comp_indices_[ rci ] = i;
220  }
221  }
222 
223  return rev_comp_indices_;
224 }
225 
226 std::vector<std::string> const& SignatureSpecifications::kmer_reverse_complement_list() const
227 {
228  if( ! is_nucleic_acids() ) {
229  throw std::runtime_error( "Reverse complement kmers only valid for nucleic acid codes." );
230  }
231 
232  // Lazy loading.
233  if( rev_comp_list_.size() > 0 ) {
234  return rev_comp_list_;
235  }
236 
237  // Some properties.
238  auto const& kl = kmer_list();
239  auto const& rci = kmer_combined_reverse_complement_map();
240  auto const rcls = kmer_reverse_complement_list_size();
241  assert( kl.size() == rci.size() );
242  assert( rcls <= kl.size() );
243 
244  // Init result with empty strings.
245  rev_comp_list_ = std::vector<std::string>( rcls, "" );
246 
247  // Fill the list by iterating all normal kmers.
248  for( size_t i = 0; i < kl.size(); ++i ) {
249 
250  // Get index in rev comp list for the current kmer.
251  auto const idx = rci[i];
252  assert( idx < rev_comp_list_.size() );
253 
254  // If there is not yet an entry, set it.
255  if( rev_comp_list_[ idx ].empty() ) {
256  rev_comp_list_[ idx ] = kl[ i ];
257  }
258  }
259 
260  return rev_comp_list_;
261 }
262 
264 {
265  if( ! is_nucleic_acids() ) {
266  throw std::runtime_error( "Reverse complement kmers only valid for nucleic acid codes." );
267  }
268 
269  // Calcualtions according to: https://stackoverflow.com/a/40953130
270 
271  // Number of palindromic k-mers. For odd k, there are none, for even k, there are 4^(k/2) = 2^k
272  auto const p = ( k_ % 2 == 1 ? 0 : utils::int_pow( 2, k_ ));
273 
274  // Number of entries needed to store rev comp kmers.
275  if( with_palindromes ) {
276  return p + ( utils::int_pow( 4, k_ ) - p ) / 2;
277  } else {
278  return ( utils::int_pow( 4, k_ ) - p ) / 2;
279  }
280 }
281 
282 } // namespace sequence
283 } // namespace genesis
bool is_valid_int_pow(size_t base, size_t exp)
Return whether the given power can be stored within a size_t.
Definition: common.hpp:162
size_t int_pow(size_t base, size_t exp)
Calculate the power base^exp for positive integer values.
Definition: common.hpp:142
std::vector< size_t > const & kmer_reverse_complement_indices() const
Get the indices for each kmer in kmer_list() to its reverse complement in the list.
size_t kmer_reverse_complement_list_size(bool with_palindromes=true) const
bool is_nucleic_acids() const
Speedup and shortcut to test whether the alphabet() is "ACGT".
static size_t const InvalidCharIndex
Value that is used to indicate an invalid (non-alphabet) char when using index_of().
std::string reverse_complement(std::string const &sequence, bool accept_degenerated)
Get the reverse complement of a nucleic acid sequence.
Definition: codes.cpp:487
std::vector< size_t > const & kmer_combined_reverse_complement_map() const
Get a map from indices of kmer_list() and signature_counts() vectors to a smaller list which combines...
std::string normalize_code_alphabet(std::string const &alphabet)
Normalize an alphabet set of Sequence codes, i.e., make them upper case, sort them, and remove duplicates.
Definition: codes.cpp:346
std::vector< std::string > const & kmer_reverse_complement_list() const
std::vector< std::string > const & kmer_list() const
Return the list of all possible k-mers for the given k and alphabet.
void set_char_upper_lower(char c, T value)
Set the lookup status for both the upper and lower case of a given char.