A library for working with phylogenetic and population genetic data.
v0.32.0
kmer.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_SEQUENCE_KMER_KMER_H_
2 #define GENESIS_SEQUENCE_KMER_KMER_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
34 #include <array>
35 #include <cassert>
36 #include <climits>
37 #include <cstdint>
38 #include <stdexcept>
39 #include <string>
40 
41 namespace genesis {
42 namespace sequence {
43 
44 // =================================================================================================
45 // Kmer
46 // =================================================================================================
47 
/**
 * @brief Default Tag for a Kmer, used when no other Tag is provided.
 */
struct KmerTagDefault{};

// TODO template params: Tag, Encoding, Storage
// Encoding is a policy class that handles everything about the alphabet: how many bits are used
// per character, which bits stand for which character, and how to convert the alphabet.
// Storage defaults to a simple wrapper around a 64 bit uint that gives access to consecutive
// bits according to the encoding bit width, but there should also be a std::array based class
// that allows to do the same on larger k-mers.

/**
 * @brief Kmer class template for representing k-mers of various sizes, currently up to k = 32.
 *
 * The k-mer is stored in a single WordType, using BITS_PER_CHAR bits per character, with the
 * character at position 0 occupying the least significant bits. The value of k is a static
 * property shared by all instances that use the same @p Tag.
 *
 * @tparam Tag Type used to distinguish sets of k-mers that each have their own value of k.
 */
template<typename Tag = KmerTagDefault>
class Kmer
{
public:

    // -------------------------------------------------------------------------
    //     Typedefs and Enums
    // -------------------------------------------------------------------------

    /**
     * @brief Underlying integer type used to store the k-mer.
     */
    using WordType = uint64_t;

    /**
     * @brief Number of bits in the underlying integer type used to store the k-mer.
     */
    static const size_t BIT_SIZE = sizeof(WordType) * CHAR_BIT;
    static_assert( CHAR_BIT == 8, "CHAR_BIT != 8" );

    /**
     * @brief Number of bits needed to store a character of input data.
     */
    static const uint8_t BITS_PER_CHAR = 2;

    /**
     * @brief Maximum number of characters (i.e., the maximum k) that fit into one WordType.
     */
    static const uint8_t MAX_CHARS_PER_KMER = BIT_SIZE / BITS_PER_CHAR;
    static_assert( BIT_SIZE % BITS_PER_CHAR == 0, "BIT_SIZE % BITS_PER_CHAR != 0" );

    // -------------------------------------------------------------------------
    //     Constructors and Rule of Five
    // -------------------------------------------------------------------------

    /**
     * @brief Construct a k-mer from its raw bit representation.
     *
     * The given @p data is stored as is; no validation against the current k is performed.
     */
    Kmer( WordType data )
        : data_( data )
    {
        // assert( data & dna_ones_mask_ == data );
    }

    // -------------------------------------------------------------------------
    //     K
    // -------------------------------------------------------------------------

    /**
     * @brief Get the value of k for all Kmers of the given Tag, or 0 if it has not been set yet.
     */
    static uint8_t k()
    {
        return k_;
    }

    /**
     * @brief Set the value of k for all Kmers of the given Tag.
     *
     * Setting the same value again is allowed. Use reset_k() to deliberately change the value.
     *
     * @throws std::runtime_error    if k was already set to a different value.
     * @throws std::invalid_argument if @p k is 0 or greater than MAX_CHARS_PER_KMER.
     */
    static void set_k( uint8_t k )
    {
        if( k_ != 0 && k != k_ ) {
            throw std::runtime_error( "Cannot set k for a given Tag multiple times");
        }
        reset_k( k );
    }

    /**
     * @brief Re-set the value of k for all Kmers of the given Tag.
     *
     * In contrast to set_k(), this overwrites a previously set value without complaint.
     * Existing instances are not modified.
     *
     * @throws std::invalid_argument if @p k is 0 or greater than MAX_CHARS_PER_KMER.
     */
    static void reset_k( uint8_t k )
    {
        if( k == 0 || k > MAX_CHARS_PER_KMER ) {
            throw std::invalid_argument( "Cannot use k-mer with k==" + std::to_string( k ));
        }
        k_ = k;
    }

    // -------------------------------------------------------------------------
    //     Data
    // -------------------------------------------------------------------------

    /**
     * @brief Get the BITS_PER_CHAR wide value stored at the given @p position.
     *
     * Position 0 refers to the least significant bits of the underlying word.
     *
     * @throws std::runtime_error if @p position is not smaller than the current k.
     */
    uint8_t operator[] ( size_t position ) const
    {
        assert( k_ > 0 && k_ <= MAX_CHARS_PER_KMER );
        if( position >= k_ ) {
            throw std::runtime_error(
                "Invalid position " + std::to_string( position ) +
                " in k-mer of size " + std::to_string( k_ )
            );
        }

        // Isolate the bits of the requested character, and move them down to the lowest bits.
        // The result fits into BITS_PER_CHAR bits, so that the narrowing cast is lossless.
        auto const masked = data_ & dna_char_mask_[position];
        return static_cast<uint8_t>( masked >> ( position * BITS_PER_CHAR ));
    }

    /**
     * @brief Read-only access to the raw bit representation of the k-mer.
     */
    WordType const& value() const
    {
        return data_;
    }

    /**
     * @brief Mutable access to the raw bit representation of the k-mer.
     */
    WordType& value()
    {
        return data_;
    }

    // -------------------------------------------------------------------------
    //     Internal Data
    // -------------------------------------------------------------------------

private:

    // Words with no bits and with all bits set. These are derived from WordType instead of from
    // `unsigned long` literals, as the latter is only 32 bits wide on some platforms, in which
    // case shifting it by 32 or more is undefined behavior.
    static const WordType all_0_ = 0;
    static const WordType all_1_ = ~static_cast<WordType>( 0 );

    /**
     * @brief Bit masks that select the bits of the character at each position in the word.
     *
     * Entry `i` has exactly the BITS_PER_CHAR bits set that belong to position `i`.
     */
    static const std::array<WordType, MAX_CHARS_PER_KMER> dna_char_mask_;

    // NOTE: A `dna_ones_mask_` table, with entry `n` having the `n` lowest bits set (built from
    // all_0_ and right-shifts of all_1_), was sketched here but is currently disabled, as is
    // the corresponding assertion in the constructor.

    // -------------------------------------------------------------------------
    //     Private Members
    // -------------------------------------------------------------------------

    static uint8_t k_;
    WordType data_;

};

// Value of k shared by all Kmers of a given Tag. Zero means that k has not been set yet.
template<typename Tag>
uint8_t Kmer<Tag>::k_ = 0;

// Each entry is the two-bit pattern 0b11 shifted to the bit offset of its position. The shifts
// are performed on WordType, so that they are well-defined independent of the platform width
// of `unsigned long`.
template<typename Tag>
const std::array<typename Kmer<Tag>::WordType, Kmer<Tag>::MAX_CHARS_PER_KMER> Kmer<Tag>::dna_char_mask_ =
{{
    WordType( 3 ) <<  0, WordType( 3 ) <<  2, WordType( 3 ) <<  4, WordType( 3 ) <<  6,
    WordType( 3 ) <<  8, WordType( 3 ) << 10, WordType( 3 ) << 12, WordType( 3 ) << 14,
    WordType( 3 ) << 16, WordType( 3 ) << 18, WordType( 3 ) << 20, WordType( 3 ) << 22,
    WordType( 3 ) << 24, WordType( 3 ) << 26, WordType( 3 ) << 28, WordType( 3 ) << 30,
    WordType( 3 ) << 32, WordType( 3 ) << 34, WordType( 3 ) << 36, WordType( 3 ) << 38,
    WordType( 3 ) << 40, WordType( 3 ) << 42, WordType( 3 ) << 44, WordType( 3 ) << 46,
    WordType( 3 ) << 48, WordType( 3 ) << 50, WordType( 3 ) << 52, WordType( 3 ) << 54,
    WordType( 3 ) << 56, WordType( 3 ) << 58, WordType( 3 ) << 60, WordType( 3 ) << 62
}};
253 
254 } // namespace sequence
255 } // namespace genesis
256 
257 #endif // include guard
genesis::sequence::KmerTagDefault
Default Tag for a Kmer, used when no other Tag is provided.
Definition: kmer.hpp:51
genesis::sequence::Kmer::value
WordType & value()
Definition: kmer.hpp:174
genesis::sequence::Kmer
Kmer class template for representing k-mers of various sizes, currently up to k = 32.
Definition: kmer.hpp:69
genesis::sequence::Kmer::BIT_SIZE
static const size_t BIT_SIZE
Number of bits in the underlying integer type used to store the k-mer.
Definition: kmer.hpp:85
genesis::sequence::Kmer::MAX_CHARS_PER_KMER
static const uint8_t MAX_CHARS_PER_KMER
Definition: kmer.hpp:99
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::sequence::Kmer::k
static uint8_t k()
Definition: kmer.hpp:117
genesis::sequence::Kmer::BITS_PER_CHAR
static const uint8_t BITS_PER_CHAR
Number of bits needed to store a character of input data.
Definition: kmer.hpp:94
genesis::sequence::Kmer::Kmer
Kmer(WordType data)
Definition: kmer.hpp:107
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::Kmer::reset_k
static void reset_k(uint8_t k)
Re-set the value of k for all Kmers of the given Tag.
Definition: kmer.hpp:145
genesis::sequence::Kmer::operator[]
uint8_t operator[](size_t position) const
Definition: kmer.hpp:157
genesis::sequence::Kmer::WordType
uint64_t WordType
Underlying integer type used to store the k-mer.
Definition: kmer.hpp:80
genesis::sequence::Kmer::set_k
static void set_k(uint8_t k)
Set the value of k for all Kmers of the given Tag.
Definition: kmer.hpp:129
genesis::sequence::Kmer::value
WordType const & value() const
Definition: kmer.hpp:169