41 #include <unordered_map>
56 is_nucleic_acids_ = ( alphabet_ ==
"ACGT" );
59 if( alphabet_.size() == 0 ) {
60 throw std::invalid_argument(
"Invalid alphabet for kmer list." );
63 throw std::invalid_argument(
"Cannot store kmers for such large k." );
66 throw std::invalid_argument(
"Invalid k==0 for kmer list." );
71 for(
size_t i = 0; i < alphabet_.size(); ++i ) {
83 if( kmer_list_.size() > 0 ) {
92 kmer_list_ = std::vector<std::string>( p );
93 for(
size_t i = 0; i < p; ++i ) {
96 auto kmer = std::string( k_,
'#' );
97 assert( kmer.size() == k_ );
101 for(
size_t j = 0; j < k_; ++j ) {
102 assert(( 1 + j <= k_ ) && ( k_ - 1 - j < kmer.size() ));
104 kmer[ k_ - 1 - j ] = w[ c % w.size() ];
111 kmer_list_[ i ] = kmer;
125 throw std::runtime_error(
"Reverse complement kmers only valid for nucleic acid codes." );
129 if( rev_comp_map_.size() > 0 ) {
130 return rev_comp_map_;
135 rev_comp_map_ = std::vector<size_t>(
utils::int_pow( 4, k_ ), 0 );
139 std::unordered_map<std::string, size_t> done;
143 assert( rev_comp_map_.size() == list.size() );
147 for(
size_t i = 0; i < list.size(); ++i ) {
148 auto const& seq = list[i];
152 assert( done.count( seq ) == 0 );
158 if( done.count( rev ) == 0 ) {
159 auto const ni = done.size();
160 rev_comp_map_[ i ] = ni;
164 rev_comp_map_[ i ] = done[ rev ];
171 return rev_comp_map_;
177 throw std::runtime_error(
"Reverse complement kmers only valid for nucleic acid codes." );
181 if( rev_comp_indices_.size() > 0 ) {
182 return rev_comp_indices_;
189 rev_comp_indices_ = std::vector<size_t>( list.size(), std::numeric_limits<size_t>::max() );
192 for(
size_t i = 0; i < list.size(); ++i ) {
195 if( rev_comp_indices_[i] != std::numeric_limits<size_t>::max() ) {
200 auto const& seq = list[i];
206 rev_comp_indices_[i] = i;
212 auto const it = std::find( list.begin() + i, list.end(), rev );
213 assert( it != list.end() );
214 size_t const rci = std::distance( list.begin(), it );
215 assert( rci < rev_comp_indices_.size() );
218 rev_comp_indices_[ i ] = rci;
219 rev_comp_indices_[ rci ] = i;
223 return rev_comp_indices_;
229 throw std::runtime_error(
"Reverse complement kmers only valid for nucleic acid codes." );
233 if( rev_comp_list_.size() > 0 ) {
234 return rev_comp_list_;
241 assert( kl.size() == rci.size() );
242 assert( rcls <= kl.size() );
245 rev_comp_list_ = std::vector<std::string>( rcls,
"" );
248 for(
size_t i = 0; i < kl.size(); ++i ) {
251 auto const idx = rci[i];
252 assert( idx < rev_comp_list_.size() );
255 if( rev_comp_list_[ idx ].empty() ) {
256 rev_comp_list_[ idx ] = kl[ i ];
260 return rev_comp_list_;
266 throw std::runtime_error(
"Reverse complement kmers only valid for nucleic acid codes." );
275 if( with_palindromes ) {