41 #include <unordered_map> 56 is_nucleic_acids_ = ( alphabet_ ==
"ACGT" );
59 if( alphabet_.size() == 0 ) {
60 throw std::invalid_argument(
"Invalid alphabet for kmer list." );
63 throw std::invalid_argument(
"Cannot store kmers for such large k." );
66 throw std::invalid_argument(
"Invalid k==0 for kmer list." );
71 for(
size_t i = 0; i < alphabet_.size(); ++i ) {
83 if( kmer_list_.size() > 0 ) {
92 kmer_list_ = std::vector<std::string>( p );
93 for(
size_t i = 0; i < p; ++i ) {
96 auto kmer = std::string( k_,
'#' );
97 assert( kmer.size() == k_ );
101 for(
size_t j = 0; j < k_; ++j ) {
102 assert(( 1 + j <= k_ ) && ( k_ - 1 - j < kmer.size() ));
104 kmer[ k_ - 1 - j ] = w[ c % w.size() ];
111 kmer_list_[ i ] = kmer;
125 throw std::runtime_error(
"Reverse complement kmers only valid for nucleic acid codes." );
129 if( rev_comp_map_.size() > 0 ) {
130 return rev_comp_map_;
135 rev_comp_map_ = std::vector<size_t>(
utils::int_pow( 4, k_ ), 0 );
139 std::unordered_map<std::string, size_t> done;
143 assert( rev_comp_map_.size() == list.size() );
147 for(
size_t i = 0; i < list.size(); ++i ) {
148 auto const& seq = list[i];
152 assert( done.count( seq ) == 0 );
158 if( done.count( rev ) == 0 ) {
159 auto const ni = done.size();
160 rev_comp_map_[ i ] = ni;
164 rev_comp_map_[ i ] = done[ rev ];
171 return rev_comp_map_;
177 throw std::runtime_error(
"Reverse complement kmers only valid for nucleic acid codes." );
181 if( rev_comp_indices_.size() > 0 ) {
182 return rev_comp_indices_;
189 rev_comp_indices_ = std::vector<size_t>( list.size(), std::numeric_limits<size_t>::max() );
192 for(
size_t i = 0; i < list.size(); ++i ) {
195 if( rev_comp_indices_[i] != std::numeric_limits<size_t>::max() ) {
200 auto const& seq = list[i];
206 rev_comp_indices_[i] = i;
212 auto const it = std::find( list.begin() + i, list.end(), rev );
213 assert( it != list.end() );
214 size_t const rci = std::distance( list.begin(), it );
215 assert( rci < rev_comp_indices_.size() );
218 rev_comp_indices_[ i ] = rci;
219 rev_comp_indices_[ rci ] = i;
223 return rev_comp_indices_;
229 throw std::runtime_error(
"Reverse complement kmers only valid for nucleic acid codes." );
233 if( rev_comp_list_.size() > 0 ) {
234 return rev_comp_list_;
241 assert( kl.size() == rci.size() );
242 assert( rcls <= kl.size() );
245 rev_comp_list_ = std::vector<std::string>( rcls,
"" );
248 for(
size_t i = 0; i < kl.size(); ++i ) {
251 auto const idx = rci[i];
252 assert( idx < rev_comp_list_.size() );
255 if( rev_comp_list_[ idx ].empty() ) {
256 rev_comp_list_[ idx ] = kl[ i ];
260 return rev_comp_list_;
266 throw std::runtime_error(
"Reverse complement kmers only valid for nucleic acid codes." );
275 if( with_palindromes ) {
bool is_valid_int_pow(size_t base, size_t exp)
Return whether the given power can be stored within a size_t.
std::vector< size_t > const & kmer_combined_reverse_complement_map() const
Get a map from indices of kmer_list() and signature_counts() vectors to a smaller list which combines...
size_t int_pow(size_t base, size_t exp)
Calculate the power base^exp for positive integer values.
std::vector< std::string > const & kmer_list() const
Return the list of all possible k-mers for the given k and alphabet.
size_t kmer_list_size() const
std::string normalize_code_alphabet(std::string const &alphabet)
Normalize an alphabet set of Sequence codes, i.e., make them upper case, sort them, and remove duplicates.
bool is_nucleic_acids() const
Speedup and shortcut to test whether the alphabet() is "ACGT".
static size_t const InvalidCharIndex
Value that is used to indicate an invalid (non-alphabet) char when using index_of().
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::string const & alphabet() const
std::string reverse_complement(std::string const &sequence, bool accept_degenerated)
Get the reverse complement of a nucleic acid sequence.
std::vector< size_t > const & kmer_reverse_complement_indices() const
Get the indices for each kmer in kmer_list() to its reverse complement in the list.
size_t kmer_reverse_complement_list_size(bool with_palindromes=true) const
void set_char_upper_lower(char c, T value)
Set the lookup status for both the upper and lower case of a given char.
SignatureSpecifications()=default
std::vector< std::string > const & kmer_reverse_complement_list() const