A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
sequence/functions/entropy.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
37 #include <algorithm>
38 #include <cassert>
39 #include <cmath>
40 #include <stdexcept>
41 
42 namespace genesis {
43 namespace sequence {
44 
45 // =================================================================================================
46 // Site Entropy
47 // =================================================================================================
48 
49 double site_entropy(
50  SiteCounts const& counts,
51  size_t site_idx,
52  SiteEntropyOptions options
53 ) {
54  if( site_idx >= counts.length() ) {
55  throw std::runtime_error(
56  "Invalid site index for calculating site entropy: " + std::to_string( site_idx ) + "."
57  );
58  }
59 
60  // Prepare some constants (speedup).
61  auto const ln_2 = log( 2.0 );
62  auto const num_seqs = static_cast<double>( counts.added_sequences_count() );
63  auto const num_chars = counts.characters().size();
64 
65  // Results: we add up the entropy and the number of counts that we have seen in total.
66  double entropy = 0.0;
67  SiteCounts::CountsIntType counts_sum = 0;
68 
69  // Accumulate entropy and total counts for the site.
70  for( size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
71  auto char_count = counts.count_at( char_idx, site_idx );
72  counts_sum += char_count;
73 
74  double char_prob = static_cast<double>( char_count ) / num_seqs;
75  if( char_prob > 0.0 ) {
76  entropy -= char_prob * log( char_prob ) / ln_2;
77  }
78  }
79 
80  // If we want to include gaps, add the entropy for the gap probability.
81  if( options & SiteEntropyOptions::kIncludeGaps ) {
82  assert( counts_sum <= num_seqs );
83  double gap_prob = 1.0 - ( static_cast<double>( counts_sum ) / num_seqs );
84  if( gap_prob > 0.0 ) {
85  entropy -= gap_prob * log( gap_prob ) / ln_2;
86  }
87  }
88 
89  // If we want to weight using the determined characters, use their proportion as a factor.
90  if( options & SiteEntropyOptions::kWeighted ) {
91  entropy *= ( static_cast<double>( counts_sum ) / num_seqs );
92  }
93 
94  // If we want to normalize, calculate the H_max for the used number of characters.
95  if( options & SiteEntropyOptions::kNormalized ) {
96  double hmax = 1.0;
97  if( options & SiteEntropyOptions::kIncludeGaps ) {
98  hmax = log( static_cast<double>( num_chars + 1 )) / ln_2;
99  } else {
100  hmax = log( static_cast<double>( num_chars )) / ln_2;
101  }
102  return entropy / hmax;
103 
104  } else {
105  return entropy;
106  }
107 }
108 
109 // =================================================================================================
110 // Site Information
111 // =================================================================================================
112 
114  SiteCounts const& counts,
115  size_t site_index,
116  bool use_small_sample_correction,
117  SiteEntropyOptions options
118 ) {
119  // Max possible entropy for the given number of characters in the counts object.
120  auto const num_chars = static_cast<double>( counts.characters().size() );
121  auto const log_num = log( num_chars ) / log( 2.0 );
122 
123  // Small sample correction.
124  double e = 0.0;
125  if( use_small_sample_correction ) {
126  // Approximation for the small-sample correction, according to
127  // https://en.wikipedia.org/wiki/Sequence_logo
128  e = ( 1.0 / log( 2.0 ) ) * (( num_chars - 1.0 ) / ( 2.0 * counts.added_sequences_count() ));
129  }
130 
131  // Result, using the entropy.
132  return log_num - site_entropy( counts, site_index, options ) - e;
133 }
134 
135 // =================================================================================================
136 // Absolute Entropy
137 // =================================================================================================
138 
140  SiteCounts const& counts,
141  SiteEntropyOptions per_site_options
142 ) {
143  double sum = 0.0;
144  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
145  sum += site_entropy( counts, site_idx, per_site_options );
146  }
147  return sum;
148 }
149 
150 // =================================================================================================
151 // Averaged Entropy
152 // =================================================================================================
153 
155  SiteCounts const& counts,
156  bool only_determined_sites,
157  SiteEntropyOptions per_site_options
158 ) {
159  // Counters.
160  double sum = 0.0;
161  size_t determined_sites = 0;
162 
163  // Consts for speedup.
164  auto const num_chars = counts.characters().size();
165 
166  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
167  sum += site_entropy( counts, site_idx, per_site_options );
168 
169  // Count determined sites.
170  if( only_determined_sites ) {
171  bool det = false;
172  for( size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
173  det |= ( counts.count_at( char_idx, site_idx ) > 0 );
174  }
175  if( det ) {
176  ++determined_sites;
177  }
178  }
179  }
180 
181  if( only_determined_sites ) {
182  return sum / static_cast<double>( determined_sites );
183  } else {
184  return sum / static_cast<double>( counts.length() );
185  }
186 }
187 
188 } // namespace sequence
189 } // namespace genesis
In addition to the characters of the SiteCounts object, use the undetermined and gap characters...
Store counts of the occurence for certain characters at the sites of Sequences.
Definition: counts.hpp:85
double averaged_entropy(SiteCounts const &counts, bool only_determined_sites, SiteEntropyOptions per_site_options)
Return the averaged sum of all site entropies.
double sum(const Histogram &h)
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:373
CountsIntType added_sequences_count() const
Return the number of processed Sequences, i.e., how many Sequences were added in total.
Definition: counts.cpp:86
Normalize the resulting entropy using the maximum entropy possible.
SiteEntropyOptions
Option flags to refine the calculation of site_entropy().
double absolute_entropy(SiteCounts const &counts, SiteEntropyOptions per_site_options)
Return the sum of all site entropies.
CountsIntType count_at(size_t character_index, size_t site_index) const
Return the count for a character and a site, given their indices.
Definition: counts.cpp:109
Weight the entropy using the summed relative frequencies of the characters.
size_t length() const
Return the number of sites used for counting.
Definition: counts.cpp:75
uint32_t CountsIntType
Type of uint used for internally counting the freuqencies of Sequence sites.
Definition: counts.hpp:101
double site_information(SiteCounts const &counts, size_t site_index, bool use_small_sample_correction, SiteEntropyOptions options)
Calculate the information content at one site of a SiteCounts object.
std::string characters() const
Return the character set that is used for counting.
Definition: counts.cpp:80
double site_entropy(SiteCounts const &counts, size_t site_idx, SiteEntropyOptions options)
Calculate the entropy at one site of a SiteCounts object.