A toolkit for working with phylogenetic data.
v0.24.0
sequence/functions/stats.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 
39 #include <array>
40 #include <cassert>
41 
42 namespace genesis {
43 namespace sequence {
44 
45 // =================================================================================================
46 // Characteristics
47 // =================================================================================================
48 
49 // -------------------------------------------------------------------------
50 // Site Histogram
51 // -------------------------------------------------------------------------
52 
53 std::map<char, size_t> site_histogram( Sequence const& seq )
54 {
55  // We do a detour via an array, as this has way faster access times.
56 
57  // Init the array.
58  std::array<size_t, 256> freqs;
59  for( size_t i = 0; i < freqs.size(); ++i ) {
60  freqs[i] = 0;
61  }
62 
63  // Count frequencies.
64  for( auto const& s : seq ) {
65  ++freqs[s];
66  }
67 
68  // Convert to final format.
69  std::map<char, size_t> result;
70  for( size_t i = 0; i < freqs.size(); ++i ) {
71  if( freqs[i] > 0 ) {
72  result[i] = freqs[i];
73  }
74  }
75  return result;
76 }
77 std::map<char, size_t> site_histogram( SequenceSet const& set )
78 {
79  // We do a detour via an array, as this has way faster access times.
80 
81  // Init the array.
82  std::array<size_t, 256> freqs;
83  for( size_t i = 0; i < freqs.size(); ++i ) {
84  freqs[i] = 0;
85  }
86 
87  // Count frequencies.
88  for( auto const& seq : set ) {
89  for( auto const& s : seq ) {
90  ++freqs[s];
91  }
92  }
93 
94  // Convert to final format.
95  std::map<char, size_t> result;
96  for( size_t i = 0; i < freqs.size(); ++i ) {
97  if( freqs[i] > 0 ) {
98  result[i] = freqs[i];
99  }
100  }
101  return result;
102 }
103 
104 // -------------------------------------------------------------------------
105 // Base Frequencies
106 // -------------------------------------------------------------------------
107 
111 static std::map<char, double> base_frequencies_accumulator(
112  std::map<char, size_t> const& sitehistogram,
113  std::string const& plain_chars
114 ) {
115  // Prepare result (do it here to facilitate copy elision).
116  std::map<char, double> result;
117 
118  // Calculate sum of raw counts of all chars given in plain_chars.
119  size_t sum = 0;
120  for( auto const& shp : sitehistogram ) {
121  if( plain_chars.find( shp.first ) != std::string::npos ) {
122  sum += shp.second;
123  }
124  }
125 
126  // Make relative.
127  for( auto const& pc : plain_chars ) {
128  if( sitehistogram.count( pc )) {
129  result[pc] = static_cast<double>( sitehistogram.at(pc) ) / static_cast<double>( sum );
130  }
131  }
132  return result;
133 }
134 
135 std::map<char, double> base_frequencies( Sequence const& seq, std::string const& plain_chars )
136 {
137  auto const sh = site_histogram( seq );
138  return base_frequencies_accumulator( sh, plain_chars );
139 }
140 
141 std::map<char, double> base_frequencies( SequenceSet const& set, std::string const& plain_chars )
142 {
143  auto const sh = site_histogram( set );
144  return base_frequencies_accumulator( sh, plain_chars );
145 }
146 
147 // -------------------------------------------------------------------------
148 // Char counting and validation
149 // -------------------------------------------------------------------------
150 
151 size_t count_chars( SequenceSet const& set, std::string const& chars )
152 {
153  // Init array to false, then set all necessary chars to true.
154  auto lookup = utils::CharLookup<bool>( false );
155  lookup.set_selection_upper_lower( chars, true );
156 
157  size_t counter = 0;
158  for( auto& s : set ) {
159  for( auto& c : s ) {
160  // get rid of this check and leave it to the parser/lexer/stream iterator
161  if( c < 0 ) {
162  continue;
163  }
164  assert( c >= 0 );
165  if( lookup[ c ] ) {
166  ++counter;
167  }
168  }
169  }
170  return counter;
171 }
172 
173 // -------------------------------------------------------------------------
174 // Gap Counting
175 // -------------------------------------------------------------------------
176 
177 double gapyness( SequenceSet const& set, std::string const& gap_chars )
178 {
179  size_t gaps = count_chars( set, gap_chars );
180  size_t len = total_length( set );
181  if( len == 0 ) {
182  return 0.0;
183  }
184 
185  double ret = static_cast<double>( gaps ) / static_cast<double>( len );
186  assert( 0.0 <= ret && ret <= 1.0 );
187  return ret;
188 }
189 
190 size_t gap_site_count( SiteCounts const& counts )
191 {
192  size_t res = 0;
193  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
194  bool all_gap_site = true;
195 
196  for( size_t char_idx = 0; char_idx < counts.characters().size(); ++char_idx ) {
197  all_gap_site &= ( counts.count_at( char_idx, site_idx ) == 0 );
198  }
199 
200  if( all_gap_site ) {
201  ++res;
202  }
203  }
204  return res;
205 }
206 
207 } // namespace sequence
208 } // namespace genesis
Store counts of the occurence for certain characters at the sites of Sequences.
Definition: counts.hpp:85
static std::map< char, double > base_frequencies_accumulator(std::map< char, size_t > const &sitehistogram, std::string const &plain_chars)
Local helper function that turns a site histogram into base frequencies.
size_t total_length(SequenceSet const &set)
Return the total length (sum) of all Sequences in the SequenceSet.
size_t gap_site_count(SiteCounts const &counts)
double sum(const Histogram &h)
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
size_t count_chars(SequenceSet const &set, std::string const &chars)
Count the number of occurrences of the given chars within the sites of the SequenceSet.
std::map< char, size_t > site_histogram(Sequence const &seq)
Get a histogram of the occurrences of particular sites, given a Sequence.
size_t length() const
Return the number of sites used for counting.
Definition: counts.cpp:75
Store a set of Sequences.
void set_selection_upper_lower(std::string const &chars, T value)
Set the lookup status for both the upper and lower case of all chars that are contained in a given st...
CountsIntType count_at(size_t character_index, size_t site_index) const
Return the count for a character and a site, given their indices.
Definition: counts.cpp:109
std::map< char, double > base_frequencies(Sequence const &seq, std::string const &plain_chars)
Get the base frequencies of the sites in a Sequence given the base chars.
std::string characters() const
Return the character set that is used for counting.
Definition: counts.cpp:80
double gapyness(SequenceSet const &set, std::string const &gap_chars)
Return the "gapyness" of the Sequences, i.e., the proportion of gap chars and other completely undete...