A library for working with phylogenetic and population genetic data.
v0.27.0
sequence/functions/entropy.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
36 
37 #include <algorithm>
38 #include <cassert>
39 #include <cmath>
40 #include <stdexcept>
41 
42 namespace genesis {
43 namespace sequence {
44 
45 // =================================================================================================
46 // Per Site Entropy and Information
47 // =================================================================================================
48 
49 double site_entropy(
50  SiteCounts const& counts,
51  size_t site_idx,
52  SiteEntropyOptions options
53 ) {
54  if( site_idx >= counts.length() ) {
55  throw std::runtime_error(
56  "Invalid site index for calculating site entropy: " + std::to_string( site_idx ) + "."
57  );
58  }
59 
60  // Prepare some constants (speedup).
61  auto const ln_2 = std::log( 2.0 );
62  auto const num_seqs = static_cast<double>( counts.added_sequences_count() );
63  auto const num_chars = counts.characters().size();
64 
65  // Results: we add up the entropy and the number of counts that we have seen in total.
66  double entropy = 0.0;
67  SiteCounts::CountsIntType counts_sum = 0;
68 
69  // Accumulate entropy and total counts for the site.
70  for( size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
71  auto char_count = counts.count_at( char_idx, site_idx );
72  counts_sum += char_count;
73 
74  double char_prob = static_cast<double>( char_count ) / num_seqs;
75  if( char_prob > 0.0 ) {
76  entropy -= char_prob * std::log( char_prob ) / ln_2;
77  }
78  }
79 
80  // If we want to include gaps, add the entropy for the gap probability.
81  if( options & SiteEntropyOptions::kIncludeGaps ) {
82  assert( counts_sum <= num_seqs );
83  double gap_prob = 1.0 - ( static_cast<double>( counts_sum ) / num_seqs );
84  if( gap_prob > 0.0 ) {
85  entropy -= gap_prob * std::log( gap_prob ) / ln_2;
86  }
87  }
88 
89  // If we want to weight using the determined characters, use their proportion as a factor.
90  if( options & SiteEntropyOptions::kWeighted ) {
91  entropy *= ( static_cast<double>( counts_sum ) / num_seqs );
92  }
93 
94  // If we want to normalize, calculate the H_max for the used number of characters.
95  if( options & SiteEntropyOptions::kNormalized ) {
96  double hmax = 1.0;
97  if( options & SiteEntropyOptions::kIncludeGaps ) {
98  // Calculate log( num_chars + 1 )
99  hmax = std::log1p( static_cast<double>( num_chars )) / ln_2;
100  } else {
101  hmax = std::log( static_cast<double>( num_chars )) / ln_2;
102  }
103  return entropy / hmax;
104 
105  } else {
106  return entropy;
107  }
108 }
109 
111  SiteCounts const& counts,
112  size_t site_index,
113  bool use_small_sample_correction,
114  SiteEntropyOptions options
115 ) {
116  // Max possible entropy for the given number of characters in the counts object.
117  auto const num_chars = static_cast<double>( counts.characters().size() );
118  auto const log_num = log( num_chars ) / log( 2.0 );
119 
120  // Small sample correction.
121  double e = 0.0;
122  if( use_small_sample_correction ) {
123  // Approximation for the small-sample correction, according to
124  // https://en.wikipedia.org/wiki/Sequence_logo
125  e = ( 1.0 / log( 2.0 ) ) * (( num_chars - 1.0 ) / ( 2.0 * counts.added_sequences_count() ));
126  }
127 
128  // Result, using the entropy.
129  return log_num - site_entropy( counts, site_index, options ) - e;
130 }
131 
132 // =================================================================================================
133 // Total Entropy and Information
134 // =================================================================================================
135 
137  SiteCounts const& counts,
138  SiteEntropyOptions per_site_options
139 ) {
140  double sum = 0.0;
141  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
142  sum += site_entropy( counts, site_idx, per_site_options );
143  }
144  return sum;
145 }
146 
148  SiteCounts const& counts,
149  bool only_determined_sites,
150  SiteEntropyOptions per_site_options
151 ) {
152  // Counters.
153  double sum = 0.0;
154  size_t determined_sites = 0;
155 
156  // Consts for speedup.
157  auto const num_chars = counts.characters().size();
158 
159  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
160  sum += site_entropy( counts, site_idx, per_site_options );
161 
162  // Count determined sites.
163  if( only_determined_sites ) {
164  bool det = false;
165  for( size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
166  det |= ( counts.count_at( char_idx, site_idx ) > 0 );
167  }
168  if( det ) {
169  ++determined_sites;
170  }
171  }
172  }
173 
174  if( only_determined_sites ) {
175  return sum / static_cast<double>( determined_sites );
176  } else {
177  return sum / static_cast<double>( counts.length() );
178  }
179 }
180 
182  SiteCounts const& counts,
183  bool use_small_sample_correction,
184  SiteEntropyOptions per_site_options
185 ) {
186  double sum = 0.0;
187  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
188  sum += site_information( counts, site_idx, use_small_sample_correction, per_site_options );
189  }
190  return sum;
191 }
192 
194  SiteCounts const& counts,
195  bool only_determined_sites,
196  bool use_small_sample_correction,
197  SiteEntropyOptions per_site_options
198 ) {
199  // Counters.
200  double sum = 0.0;
201  size_t determined_sites = 0;
202 
203  // Consts for speedup.
204  auto const num_chars = counts.characters().size();
205 
206  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
207  sum += site_information( counts, site_idx, use_small_sample_correction, per_site_options );
208 
209  // Count determined sites.
210  if( only_determined_sites ) {
211  bool det = false;
212  for( size_t char_idx = 0; char_idx < num_chars; ++char_idx ) {
213  det |= ( counts.count_at( char_idx, site_idx ) > 0 );
214  }
215  if( det ) {
216  ++determined_sites;
217  }
218  }
219  }
220 
221  if( only_determined_sites ) {
222  return sum / static_cast<double>( determined_sites );
223  } else {
224  return sum / static_cast<double>( counts.length() );
225  }
226 }
227 
228 } // namespace sequence
229 } // namespace genesis
genesis::utils::sum
double sum(const Histogram &h)
Definition: utils/math/histogram/stats.cpp:140
genesis::sequence::SiteCounts::added_sequences_count
CountsIntType added_sequences_count() const
Return the number of processed Sequences, i.e., how many Sequences were added in total.
Definition: counts.cpp:86
genesis::sequence::SiteEntropyOptions::kWeighted
@ kWeighted
Weight the entropy using the summed relative frequencies of the characters.
counts.hpp
sequence_set.hpp
genesis::sequence::absolute_entropy
double absolute_entropy(SiteCounts const &counts, SiteEntropyOptions per_site_options)
Return the sum of all site entropies.
Definition: sequence/functions/entropy.cpp:136
genesis::sequence::average_information
double average_information(SiteCounts const &counts, bool only_determined_sites, bool use_small_sample_correction, SiteEntropyOptions per_site_options)
Calculate the information content across all sites of a SiteCounts object.
Definition: sequence/functions/entropy.cpp:193
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: functions/genome_locus.hpp:48
genesis::sequence::SiteCounts
Store counts of the occurence for certain characters at the sites of Sequences.
Definition: counts.hpp:85
genesis::sequence::SiteCounts::length
size_t length() const
Return the number of sites used for counting.
Definition: counts.cpp:75
genesis::sequence::SiteEntropyOptions::kNormalized
@ kNormalized
Normalize the resulting entropy using the maximum entropy possible.
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::site_entropy
double site_entropy(SiteCounts const &counts, size_t site_idx, SiteEntropyOptions options)
Calculate the entropy at one site of a SiteCounts object.
Definition: sequence/functions/entropy.cpp:49
genesis::sequence::SiteEntropyOptions
SiteEntropyOptions
Option flags to refine the calculation of site_entropy().
Definition: sequence/functions/entropy.hpp:70
genesis::sequence::site_information
double site_information(SiteCounts const &counts, size_t site_index, bool use_small_sample_correction, SiteEntropyOptions options)
Calculate the information content at one site of a SiteCounts object.
Definition: sequence/functions/entropy.cpp:110
genesis::sequence::SiteCounts::CountsIntType
uint32_t CountsIntType
Type of uint used for internally counting the freuqencies of Sequence sites.
Definition: counts.hpp:101
genesis::sequence::average_entropy
double average_entropy(SiteCounts const &counts, bool only_determined_sites, SiteEntropyOptions per_site_options)
Return the average sum of all site entropies.
Definition: sequence/functions/entropy.cpp:147
entropy.hpp
genesis::sequence::absolute_information
double absolute_information(SiteCounts const &counts, bool use_small_sample_correction, SiteEntropyOptions per_site_options)
Calculate the information content across all sites of a SiteCounts object.
Definition: sequence/functions/entropy.cpp:181
genesis::sequence::SiteEntropyOptions::kIncludeGaps
@ kIncludeGaps
In addition to the characters of the SiteCounts object, use the undetermined and gap characters.
genesis::sequence::SiteCounts::count_at
CountsIntType count_at(size_t character_index, size_t site_index) const
Return the count for a character and a site, given their indices.
Definition: counts.cpp:109
genesis::sequence::SiteCounts::characters
std::string characters() const
Return the character set that is used for counting.
Definition: counts.cpp:80
sequence.hpp