A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
sequence/functions/entropy.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
37 #include <algorithm>
38 #include <cassert>
39 #include <cmath>
40 #include <stdexcept>
41 
42 namespace genesis {
43 namespace sequence {
44 
45 // =================================================================================================
46 // Site Entropy
47 // =================================================================================================
48 
49 double site_entropy(
50  SequenceCounts const& counts,
51  size_t site_idx,
52  SiteEntropyOptions options
53 ) {
54  if( site_idx >= counts.length() ) {
55  throw std::runtime_error(
56  "Invalid site index for calculating site entropy: " + std::to_string( site_idx ) + "."
57  );
58  }
59 
60  // Prepare some constants (speedup).
61  auto const ln_2 = log( 2.0 );
62  auto const num_seqs = static_cast<double>( counts.added_sequences_count() );
63  auto const num_chars = counts.characters().size();
64 
65  // Results: we add up the entropy and the number of counts that we have seen in total.
66  double entropy = 0.0;
67  SequenceCounts::CountsIntType counts_sum = 0;
68 
69  // Accumulate entropy and total counts for the site.
70  for( size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
71  auto char_count = counts.count_at( char_idx, site_idx );
72  counts_sum += char_count;
73 
74  double char_prob = static_cast<double>( char_count ) / num_seqs;
75  if( char_prob > 0.0 ) {
76  entropy -= char_prob * log( char_prob ) / ln_2;
77  }
78  }
79 
80  // If we want to include gaps, add the entropy for the gap probability.
81  if( options & SiteEntropyOptions::kIncludeGaps ) {
82  assert( counts_sum <= num_seqs );
83  double gap_prob = 1.0 - ( static_cast<double>( counts_sum ) / num_seqs );
84  if( gap_prob > 0.0 ) {
85  entropy -= gap_prob * log( gap_prob ) / ln_2;
86  }
87  }
88 
89  // If we want to weight using the determined characters, use their proportion as a factor.
90  if( options & SiteEntropyOptions::kWeighted ) {
91  entropy *= ( static_cast<double>( counts_sum ) / num_seqs );
92  }
93 
94  // If we want to normalize, calculate the H_max for the used number of characters.
95  if( options & SiteEntropyOptions::kNormalized ) {
96  double hmax = 1.0;
97  if( options & SiteEntropyOptions::kIncludeGaps ) {
98  hmax = log( static_cast<double>( num_chars + 1 )) / ln_2;
99  } else {
100  hmax = log( static_cast<double>( num_chars )) / ln_2;
101  }
102  return entropy / hmax;
103 
104  } else {
105  return entropy;
106  }
107 }
108 
109 // =================================================================================================
110 // Site Information
111 // =================================================================================================
112 
114  SequenceCounts const& counts,
115  size_t site_index,
116  bool use_small_sample_correction,
117  SiteEntropyOptions options
118 ) {
119  // Max possible entropy for the given number of characters in the counts object.
120  auto const num_chars = static_cast<double>( counts.characters().size() );
121  auto const log_num = log( num_chars ) / log( 2.0 );
122 
123  // Small sample correction.
124  double e = 0.0;
125  if( use_small_sample_correction ) {
126  // Approximation for the small-sample correction, according to
127  // https://en.wikipedia.org/wiki/Sequence_logo
128  e = ( 1.0 / log( 2.0 ) ) * (( num_chars - 1.0 ) / ( 2.0 * counts.added_sequences_count() ));
129  }
130 
131  // Result, using the entropy.
132  return log_num - site_entropy( counts, site_index, options ) - e;
133 }
134 
135 // =================================================================================================
136 // Absolute Entropy
137 // =================================================================================================
138 
140  SequenceCounts const& counts,
141  SiteEntropyOptions per_site_options
142 ) {
143  double sum = 0.0;
144  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
145  sum += site_entropy( counts, site_idx, per_site_options );
146  }
147  return sum;
148 }
149 
150 // =================================================================================================
151 // Averaged Entropy
152 // =================================================================================================
153 
155  SequenceCounts const& counts,
156  bool only_determined_sites,
157  SiteEntropyOptions per_site_options
158 ) {
159  // Counters.
160  double sum = 0.0;
161  size_t determined_sites = 0;
162 
163  // Consts for speedup.
164  auto const num_chars = counts.characters().size();
165 
166  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
167  sum += site_entropy( counts, site_idx, per_site_options );
168 
169  // Count determined sites.
170  if( only_determined_sites ) {
171  bool det = false;
172  for( size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
173  det |= ( counts.count_at( char_idx, site_idx ) > 0 );
174  }
175  if( det ) {
176  ++determined_sites;
177  }
178  }
179  }
180 
181  if( only_determined_sites ) {
182  return sum / static_cast<double>( determined_sites );
183  } else {
184  return sum / static_cast<double>( counts.length() );
185  }
186 }
187 
188 } // namespace sequence
189 } // namespace genesis
In addition to the characters of the SequenceCounts object, use the undetermined and gap characters...
Store counts of the occurence for certain characters of Sequences.
Definition: counts.hpp:84
CountsIntType added_sequences_count() const
Return the number of processed Sequences, i.e., how many Sequences were added in total.
Definition: counts.cpp:86
double site_entropy(SequenceCounts const &counts, size_t site_idx, SiteEntropyOptions options)
Calculate the entropy at one site of a SequenceCounts object.
double sum(const Histogram &h)
uint32_t CountsIntType
Type of uint used for internally counting the freuqencies of Sequence sites.
Definition: counts.hpp:100
size_t length() const
Return the number of sites used for counting.
Definition: counts.cpp:75
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:300
double site_information(SequenceCounts const &counts, size_t site_index, bool use_small_sample_correction, SiteEntropyOptions options)
Calculate the information content at one site of a SequenceCounts object.
double absolute_entropy(SequenceCounts const &counts, SiteEntropyOptions per_site_options)
Return the sum of all site entropies.
Normalize the resulting entropy using the maximum entropy possible.
SiteEntropyOptions
Option flags to refine the calculation of site_entropy().
double averaged_entropy(SequenceCounts const &counts, bool only_determined_sites, SiteEntropyOptions per_site_options)
Return the averaged sum of all site entropies.
Weight the entropy using the summed relative frequencies of the characters.
std::string characters() const
Return the character set that is used for counting.
Definition: counts.cpp:80
CountsIntType count_at(size_t character_index, size_t site_index) const
Return the count for a character and a site, given their indices.
Definition: counts.cpp:109