|
A library for working with phylogenetic and population genetic data.
v0.32.0
|
|
Go to the documentation of this file.
48 #include <unordered_map>
63 auto const ws = w.size();
69 auto result = std::vector<size_t>( p, 0 );
72 if( sequence.
size() < settings.
k() ) {
82 for(
size_t pos = 0; pos < sequence.
size(); ++pos ) {
83 auto const cur = settings.
char_index( sequence[pos] );
91 throw std::runtime_error(
92 "Unknown Sequence char for kmer counting: '" +
93 std::string( 1, sequence[pos] ) +
"'"
108 if( valids >= settings.
k() ) {
109 assert( index < result.size() );
124 auto const sum =
static_cast<double>( std::accumulate( kmers.begin(), kmers.end(), 0 ));
127 auto freqs = std::vector<double>( kmers.size() );
128 for(
size_t i = 0; i < kmers.size(); ++i ) {
129 freqs[i] =
static_cast<double>( kmers[i] ) /
sum;
143 std::vector<T>
const& kmers,
147 assert( kmers.size() == map.size() );
150 auto result = std::vector<T>( size, {} );
152 for(
size_t i = 0; i < kmers.size(); ++i ) {
153 assert( map[i] < result.size() );
154 result[ map[i] ] += kmers[i];
209 template<
class Combinator>
213 Combinator combinator
220 auto result = std::vector<double>();
224 assert( freqs.size() == indices.size() );
225 for(
size_t i = 0; i < freqs.size(); ++i ) {
228 if( indices[i] == i ) {
233 if( indices[i] < i ) {
238 auto const combined = combinator( freqs[i], freqs[ indices[i] ] );
239 result.push_back( combined );
251 [](
double ff,
double fc ){
252 return std::min( ff, fc );
264 [](
double ff,
double fc ){
265 return std::max( ff, fc );
276 auto result = std::vector<double>();
280 assert( freqs.size() == indices.size() );
281 for(
size_t i = 0; i < freqs.size(); ++i ) {
284 if( indices[i] != i ) {
287 result.push_back( freqs[i] );
303 [](
double ff,
double fc ){
304 assert( ff >= 0.0 && fc >= 0.0 );
305 if( ff == 0.0 && fc == 0.0 ) {
307 }
else if( ff == 0.0 || fc == 0.0 ) {
310 return std::min( ff / fc, fc / ff );
323 [](
double ff,
double fc ){
324 assert( ff >= 0.0 && fc >= 0.0 );
325 if( ff == 0.0 && fc == 0.0 ) {
328 return std::min( ff, fc ) / ( ff + fc );
341 [](
double ff,
double fc ){
342 assert( ff >= 0.0 && fc >= 0.0 );
343 auto const s1 = ff * log( ( 2 * ff ) / ( ff + fc ) );
344 auto const s2 = fc * log( ( 2 * fc ) / ( ff + fc ) );
363 for(
size_t i = start; i < start + settings.
k(); ++i ) {
371 throw std::runtime_error(
372 "Unknown Sequence char for kmer string: '" +
373 std::string( 1, sequence[ i ] ) +
"'"
380 out << sequence[ i ];
392 auto const k = settings.
k();
396 if( sequence.
size() < k ) {
400 for(
size_t i = 0; i < sequence.
size() - k + 1; ++i ) {
417 auto const k = settings.
k();
418 for(
size_t i =
offset; i + k - 1 < sequence.
size(); i += k ) {
430 std::ostringstream out;
440 if( sequence.
size() < settings.
k() ) {
452 std::vector<std::string> result;
461 std::ostringstream out;
463 result.push_back( out.str() );
size_t kmer_list_size() const
std::string kmer_string_overlapping(Sequence const &sequence, SignatureSpecifications const &settings)
Return the sequence split into overlapping k-mers.
std::vector< double > signature_reverse_identity_frequencies(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the signature of a sequence that uses only the frequencies of k-mers whose reverse compleme...
double sum(const Histogram &h)
std::vector< size_t > signature_symmetrized_ranks(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the symmetrized rank signature of a sequence according to the settings.
static const size_t InvalidCharIndex
Value that is used to indicate an invalid (non-alphabet) char when using index_of().
std::vector< size_t > signature_symmetrized_counts(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the symmetrized counts of the sequence according to the settings.
std::vector< double > signature_complementarity_frequencies_helper(Sequence const &sequence, SignatureSpecifications const &settings, Combinator combinator)
Local helper function that returns a vector where the frequencies of the non-palindromic kmers are co...
std::vector< size_t > signature_ranks(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the rank signature of a sequence according to the settings.
std::vector< double > signature_maximal_complementarity_frequencies(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the signature of a sequence that uses the maximum frequency of reverse complement k-mers.
void offset(Histogram &h, double value)
static void kmer_string_overlapping_line(Sequence const &sequence, SignatureSpecifications const &settings, std::ostream &out)
Local helper function that writes an overlapping kmer string to a stream.
std::vector< double > signature_frequency_ratios_1(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the ratio 1 signature of a sequence.
UnknownCharBehavior unknown_char_behavior() const
size_t kmer_reverse_complement_list_size(bool with_palindromes=true) const
std::vector< double > signature_frequency_ratios_2(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the ratio 2 signature of a sequence.
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
@ kThrow
Throw an exception.
std::vector< size_t > signature_counts(Sequence const &sequence, SignatureSpecifications const &settings)
Count the occurrences of k-mers in the sequence according to the settings.
std::string const & alphabet() const
std::vector< T > signature_symmetrized_helper(std::vector< T > const &kmers, SignatureSpecifications const &settings)
Local helper function that adds up the values for reverse complement k-mers.
size_t size() const
Alias for length().
std::vector< double > signature_jensen_shannon(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the Jensen-Shannon (JS) signature of a sequence.
static void kmer_string_single_kmer(Sequence const &sequence, SignatureSpecifications const &settings, size_t start, std::ostream &out)
Local helper function that writes one kmer string to a stream.
std::vector< std::string > kmer_strings_non_overlapping(Sequence const &sequence, SignatureSpecifications const &settings)
Return the sequence split into a set of non-overlapping k-mers.
std::vector< double > signature_minimal_complementarity_frequencies(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the signature of a sequence that uses the minimum frequency of reverse complement k-mers.
static void kmer_strings_non_overlapping_line(Sequence const &sequence, SignatureSpecifications const &settings, std::ostream &out, size_t offset)
Local helper function that does one line of a non-overlapping kmer string.
std::vector< size_t > const & kmer_reverse_complement_indices() const
Get the indices for each kmer in kmer_list() to its reverse complement in the list.
std::vector< size_t > ranking_standard(RandomAccessIterator first, RandomAccessIterator last)
Return the ranking of the values in the given range, using Standard competition ranking ("1224" ranki...
size_t char_index(char c) const
Return the index of a char within the alphabet().
Specifications for calculating signatures (like k-mer counts) from Sequences.
@ kSkip
Simply ignore the char by skipping it.
std::vector< double > signature_frequencies(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the frequencies of occurrences of k-mers in the sequence according to the settings.
std::vector< double > signature_symmetrized_frequencies(Sequence const &sequence, SignatureSpecifications const &settings)
Calculate the symmetrized frequencies of the sequence according to the settings.
std::vector< size_t > const & kmer_combined_reverse_complement_map() const
Get a map from indices of kmer_list() and signature_counts() vectors to a smaller list which combines...