A toolkit for working with phylogenetic data.
v0.22.1
sequence/functions/entropy.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
37 #include <algorithm>
38 #include <cassert>
39 #include <cmath>
40 #include <stdexcept>
41 
42 namespace genesis {
43 namespace sequence {
44 
45 // =================================================================================================
46 // Site Entropy
47 // =================================================================================================
48 
49 double site_entropy(
50  SiteCounts const& counts,
51  size_t site_idx,
52  SiteEntropyOptions options
53 ) {
54  if( site_idx >= counts.length() ) {
55  throw std::runtime_error(
56  "Invalid site index for calculating site entropy: " + std::to_string( site_idx ) + "."
57  );
58  }
59 
60  // Prepare some constants (speedup).
61  auto const ln_2 = std::log( 2.0 );
62  auto const num_seqs = static_cast<double>( counts.added_sequences_count() );
63  auto const num_chars = counts.characters().size();
64 
65  // Results: we add up the entropy and the number of counts that we have seen in total.
66  double entropy = 0.0;
67  SiteCounts::CountsIntType counts_sum = 0;
68 
69  // Accumulate entropy and total counts for the site.
70  for( size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
71  auto char_count = counts.count_at( char_idx, site_idx );
72  counts_sum += char_count;
73 
74  double char_prob = static_cast<double>( char_count ) / num_seqs;
75  if( char_prob > 0.0 ) {
76  entropy -= char_prob * std::log( char_prob ) / ln_2;
77  }
78  }
79 
80  // If we want to include gaps, add the entropy for the gap probability.
81  if( options & SiteEntropyOptions::kIncludeGaps ) {
82  assert( counts_sum <= num_seqs );
83  double gap_prob = 1.0 - ( static_cast<double>( counts_sum ) / num_seqs );
84  if( gap_prob > 0.0 ) {
85  entropy -= gap_prob * std::log( gap_prob ) / ln_2;
86  }
87  }
88 
89  // If we want to weight using the determined characters, use their proportion as a factor.
90  if( options & SiteEntropyOptions::kWeighted ) {
91  entropy *= ( static_cast<double>( counts_sum ) / num_seqs );
92  }
93 
94  // If we want to normalize, calculate the H_max for the used number of characters.
95  if( options & SiteEntropyOptions::kNormalized ) {
96  double hmax = 1.0;
97  if( options & SiteEntropyOptions::kIncludeGaps ) {
98  // Calculate log( num_chars + 1 )
99  hmax = std::log1p( static_cast<double>( num_chars )) / ln_2;
100  } else {
101  hmax = std::log( static_cast<double>( num_chars )) / ln_2;
102  }
103  return entropy / hmax;
104 
105  } else {
106  return entropy;
107  }
108 }
109 
110 // =================================================================================================
111 // Site Information
112 // =================================================================================================
113 
115  SiteCounts const& counts,
116  size_t site_index,
117  bool use_small_sample_correction,
118  SiteEntropyOptions options
119 ) {
120  // Max possible entropy for the given number of characters in the counts object.
121  auto const num_chars = static_cast<double>( counts.characters().size() );
122  auto const log_num = log( num_chars ) / log( 2.0 );
123 
124  // Small sample correction.
125  double e = 0.0;
126  if( use_small_sample_correction ) {
127  // Approximation for the small-sample correction, according to
128  // https://en.wikipedia.org/wiki/Sequence_logo
129  e = ( 1.0 / log( 2.0 ) ) * (( num_chars - 1.0 ) / ( 2.0 * counts.added_sequences_count() ));
130  }
131 
132  // Result, using the entropy.
133  return log_num - site_entropy( counts, site_index, options ) - e;
134 }
135 
136 // =================================================================================================
137 // Absolute Entropy
138 // =================================================================================================
139 
141  SiteCounts const& counts,
142  SiteEntropyOptions per_site_options
143 ) {
144  double sum = 0.0;
145  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
146  sum += site_entropy( counts, site_idx, per_site_options );
147  }
148  return sum;
149 }
150 
151 // =================================================================================================
152 // Averaged Entropy
153 // =================================================================================================
154 
156  SiteCounts const& counts,
157  bool only_determined_sites,
158  SiteEntropyOptions per_site_options
159 ) {
160  // Counters.
161  double sum = 0.0;
162  size_t determined_sites = 0;
163 
164  // Consts for speedup.
165  auto const num_chars = counts.characters().size();
166 
167  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
168  sum += site_entropy( counts, site_idx, per_site_options );
169 
170  // Count determined sites.
171  if( only_determined_sites ) {
172  bool det = false;
173  for( size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
174  det |= ( counts.count_at( char_idx, site_idx ) > 0 );
175  }
176  if( det ) {
177  ++determined_sites;
178  }
179  }
180  }
181 
182  if( only_determined_sites ) {
183  return sum / static_cast<double>( determined_sites );
184  } else {
185  return sum / static_cast<double>( counts.length() );
186  }
187 }
188 
189 } // namespace sequence
190 } // namespace genesis
In addition to the characters of the SiteCounts object, use the undetermined and gap characters...
CountsIntType added_sequences_count() const
Return the number of processed Sequences, i.e., how many Sequences were added in total.
Definition: counts.cpp:86
Store counts of the occurence for certain characters at the sites of Sequences.
Definition: counts.hpp:85
double absolute_entropy(SiteCounts const &counts, SiteEntropyOptions per_site_options)
Return the sum of all site entropies.
double sum(const Histogram &h)
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:384
double site_entropy(SiteCounts const &counts, size_t site_idx, SiteEntropyOptions options)
Calculate the entropy at one site of a SiteCounts object.
Normalize the resulting entropy using the maximum entropy possible.
SiteEntropyOptions
Option flags to refine the calculation of site_entropy().
size_t length() const
Return the number of sites used for counting.
Definition: counts.cpp:75
double site_information(SiteCounts const &counts, size_t site_index, bool use_small_sample_correction, SiteEntropyOptions options)
Calculate the information content at one site of a SiteCounts object.
Weight the entropy using the summed relative frequencies of the characters.
uint32_t CountsIntType
Type of uint used for internally counting the freuqencies of Sequence sites.
Definition: counts.hpp:101
CountsIntType count_at(size_t character_index, size_t site_index) const
Return the count for a character and a site, given their indices.
Definition: counts.cpp:109
std::string characters() const
Return the character set that is used for counting.
Definition: counts.cpp:80
double averaged_entropy(SiteCounts const &counts, bool only_determined_sites, SiteEntropyOptions per_site_options)
Return the averaged sum of all site entropies.