A library for working with phylogenetic and population genetic data.
v0.32.0
function.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_SEQUENCE_KMER_FUNCTION_H_
2 #define GENESIS_SEQUENCE_KMER_FUNCTION_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
35 
36 #include <cassert>
37 #include <cstdint>
38 #include <iostream>
39 #include <stdexcept>
40 #include <string>
41 #include <type_traits>
42 
43 namespace genesis {
44 namespace sequence {
45 
46 // =================================================================================================
47 // Helper Functions
48 // =================================================================================================
49 
56 inline size_t number_of_kmers( size_t k, size_t alphabet_size = 4 )
57 {
58  // We do an explicit loop instead of using std::pow,
59  // in case that double precision is not enough here.
60  size_t n = 1;
61  for( size_t i = 0; i < k; ++i ) {
62  n *= alphabet_size;
63  }
64  return n;
65 }
66 
67 // =================================================================================================
68 // Kmer Operators
69 // =================================================================================================
70 
71 template<typename Tag>
72 bool operator==( Kmer<Tag> const& lhs, Kmer<Tag> const& rhs )
73 {
74  return lhs.value() == rhs.value();
75 }
76 
77 template<typename Tag>
78 bool operator!=( Kmer<Tag> const& lhs, Kmer<Tag> const& rhs )
79 {
80  return lhs.value() != rhs.value();
81 }
82 
83 template<typename Tag>
84 bool operator<=( Kmer<Tag> const& lhs, Kmer<Tag> const& rhs )
85 {
86  return lhs.value() <= rhs.value();
87 }
88 
89 template<typename Tag>
90 bool operator>=( Kmer<Tag> const& lhs, Kmer<Tag> const& rhs )
91 {
92  return lhs.value() >= rhs.value();
93 }
94 
95 template<typename Tag>
96 bool operator<( Kmer<Tag> const& lhs, Kmer<Tag> const& rhs )
97 {
98  return lhs.value() < rhs.value();
99 }
100 
101 template<typename Tag>
102 bool operator>( Kmer<Tag> const& lhs, Kmer<Tag> const& rhs )
103 {
104  return lhs.value() > rhs.value();
105 }
106 
107 template<typename Tag>
108 std::string to_string( Kmer<Tag> const& kmer )
109 {
110  auto result = std::string( kmer.k(), 'X' );
111  for( size_t i = 0; i < kmer.k(); ++i ) {
112  switch( kmer[i] ) {
113  case 0x00: {
114  result[i] = 'A';
115  break;
116  }
117  case 0x01: {
118  result[i] = 'C';
119  break;
120  }
121  case 0x02: {
122  result[i] = 'G';
123  break;
124  }
125  case 0x03: {
126  result[i] = 'T';
127  break;
128  }
129  }
130  }
131  return result;
132 }
133 
134 template<typename Tag>
135 std::ostream& operator<< ( std::ostream& os, Kmer<Tag> const& kmer )
136 {
137  os << to_string( kmer );
138  return os;
139 }
140 
141 // =================================================================================================
142 // Nucleotide Kmer Functions
143 // =================================================================================================
144 
145 template<typename Tag>
147 {
148  // Function is written for a specific bit width.
149  static_assert(
150  std::is_same<typename Kmer<Tag>::WordType, std::uint64_t>::value,
151  "Kmer::WordType != uint64_t"
152  );
153 
154  // Adapted from Kraken2 at https://github.com/DerrickWood/kraken2/blob/master/src/mmscanner.cc
155  // which itself adapted this for 64-bit DNA use from public domain code at
156  // https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
157 
158  // Reverse bits (leaving bit pairs intact, as those represent nucleotides):
159  // Swap consecutive pairs, then nibbles, then bytes, then byte pairs, then halves of 64-bit word
160  auto value = kmer.value();
161  value = (( value & 0xCCCCCCCCCCCCCCCCUL ) >> 2) | (( value & 0x3333333333333333UL ) << 2 );
162  value = (( value & 0xF0F0F0F0F0F0F0F0UL ) >> 4) | (( value & 0x0F0F0F0F0F0F0F0FUL ) << 4 );
163  value = (( value & 0xFF00FF00FF00FF00UL ) >> 8) | (( value & 0x00FF00FF00FF00FFUL ) << 8 );
164  value = (( value & 0xFFFF0000FFFF0000UL ) >> 16) | (( value & 0x0000FFFF0000FFFFUL ) << 16 );
165  value = ( value >> 32 ) | ( value << 32);
166 
167  // Finally, complement, and shift to correct position, removing the invalid lower bits.
168  return Kmer<Tag>( (~value) >> ( Kmer<Tag>::BIT_SIZE - kmer.k() * Kmer<Tag>::BITS_PER_CHAR ));
169 }
170 
171 template<typename Tag>
173 {
174  auto rev_comp = reverse_complement( kmer );
175  return kmer < rev_comp ? kmer : rev_comp;
176 }
177 
178 } // namespace sequence
179 } // namespace genesis
180 
181 #endif // include guard
kmer.hpp
genesis::sequence::operator!=
bool operator!=(Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
Definition: function.hpp:78
genesis::sequence::operator>
bool operator>(Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
Definition: function.hpp:102
genesis::sequence::Kmer
Kmer class template for representing k-mers of various sizes, currently up to k-32.
Definition: kmer.hpp:69
genesis::sequence::operator<
bool operator<(Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
Definition: function.hpp:96
genesis::sequence::operator<<
std::ostream & operator<<(std::ostream &out, Sequence const &seq)
Print a Sequence to an ostream in the form "label: sites".
Definition: sequence/functions/functions.cpp:449
genesis::sequence::number_of_kmers
size_t number_of_kmers(size_t k, size_t alphabet_size=4)
Compute the total number of possible k-mers for a given k and alphabet_size.
Definition: function.hpp:56
genesis::sequence::Kmer::k
static uint8_t k()
Definition: kmer.hpp:117
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::operator<=
bool operator<=(Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
Definition: function.hpp:84
genesis::sequence::canonical_representation
Kmer< Tag > canonical_representation(Kmer< Tag > const &kmer)
Definition: function.hpp:172
genesis::sequence::to_string
std::string to_string(Kmer< Tag > const &kmer)
Definition: function.hpp:108
genesis::sequence::reverse_complement
std::string reverse_complement(std::string const &sequence, bool accept_degenerated)
Get the reverse complement of a nucleic acid sequence.
Definition: codes.cpp:501
genesis::sequence::Kmer::WordType
uint64_t WordType
Underlying integer type used to store the k-mer.
Definition: kmer.hpp:80
genesis::sequence::Kmer::value
WordType const & value() const
Definition: kmer.hpp:169
genesis::sequence::operator>=
bool operator>=(Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
Definition: function.hpp:90
genesis::sequence::operator==
bool operator==(Kmer< Tag > const &lhs, Kmer< Tag > const &rhs)
Definition: function.hpp:72