54 if( site_idx >= counts.
length() ) {
55 throw std::runtime_error(
56 "Invalid site index for calculating site entropy: " +
std::to_string( site_idx ) +
"."
61 auto const ln_2 = log( 2.0 );
63 auto const num_chars = counts.
characters().size();
70 for(
size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
71 auto char_count = counts.
count_at( char_idx, site_idx );
72 counts_sum += char_count;
74 double char_prob =
static_cast<double>( char_count ) / num_seqs;
75 if( char_prob > 0.0 ) {
76 entropy -= char_prob * log( char_prob ) / ln_2;
82 assert( counts_sum <= num_seqs );
83 double gap_prob = 1.0 - (
static_cast<double>( counts_sum ) / num_seqs );
84 if( gap_prob > 0.0 ) {
85 entropy -= gap_prob * log( gap_prob ) / ln_2;
91 entropy *= (
static_cast<double>( counts_sum ) / num_seqs );
97 if( options & SiteEntropyOptions::kIncludeGaps ) {
98 hmax = log( static_cast<double>( num_chars + 1 )) / ln_2;
100 hmax = log( static_cast<double>( num_chars )) / ln_2;
102 return entropy / hmax;
116 bool use_small_sample_correction,
120 auto const num_chars =
static_cast<double>( counts.
characters().size() );
121 auto const log_num = log( num_chars ) / log( 2.0 );
125 if( use_small_sample_correction ) {
132 return log_num -
site_entropy( counts, site_index, options ) - e;
144 for(
size_t site_idx = 0; site_idx < counts.
length(); ++site_idx ) {
145 sum +=
site_entropy( counts, site_idx, per_site_options );
156 bool only_determined_sites,
161 size_t determined_sites = 0;
164 auto const num_chars = counts.
characters().size();
166 for(
size_t site_idx = 0; site_idx < counts.
length(); ++site_idx ) {
167 sum +=
site_entropy( counts, site_idx, per_site_options );
170 if( only_determined_sites ) {
172 for(
size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
173 det |= ( counts.
count_at( char_idx, site_idx ) > 0 );
181 if( only_determined_sites ) {
182 return sum /
static_cast<double>( determined_sites );
184 return sum /
static_cast<double>( counts.
length() );
In addition to the characters of the SiteCounts object, use the undetermined and gap characters...
Store counts of the occurrence of certain characters at the sites of Sequences.
double averaged_entropy(SiteCounts const &counts, bool only_determined_sites, SiteEntropyOptions per_site_options)
Return the averaged sum of all site entropies.
double sum(const Histogram &h)
std::string to_string(T const &v)
Return a string representation of a given value.
CountsIntType added_sequences_count() const
Return the number of processed Sequences, i.e., how many Sequences were added in total.
Normalize the resulting entropy using the maximum entropy possible.
SiteEntropyOptions
Option flags to refine the calculation of site_entropy().
double absolute_entropy(SiteCounts const &counts, SiteEntropyOptions per_site_options)
Return the sum of all site entropies.
CountsIntType count_at(size_t character_index, size_t site_index) const
Return the count for a character and a site, given their indices.
Weight the entropy using the summed relative frequencies of the characters.
size_t length() const
Return the number of sites used for counting.
uint32_t CountsIntType
Type of uint used for internally counting the frequencies of Sequence sites.
double site_information(SiteCounts const &counts, size_t site_index, bool use_small_sample_correction, SiteEntropyOptions options)
Calculate the information content at one site of a SiteCounts object.
std::string characters() const
Return the character set that is used for counting.
double site_entropy(SiteCounts const &counts, size_t site_idx, SiteEntropyOptions options)
Calculate the entropy at one site of a SiteCounts object.