A toolkit for working with phylogenetic data.
v0.19.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
sequence/functions/stats.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 
39 #include <cassert>
40 
41 namespace genesis {
42 namespace sequence {
43 
44 // =================================================================================================
45 // Characteristics
46 // =================================================================================================
47 
48 // -------------------------------------------------------------------------
49 // Site Histogram
50 // -------------------------------------------------------------------------
51 
52 std::map<char, size_t> site_histogram( Sequence const& seq )
53 {
54  std::map<char, size_t> result;
55  for( auto const& s : seq ) {
56  ++result[s];
57  }
58  return result;
59 }
60 std::map<char, size_t> site_histogram( SequenceSet const& set )
61 {
62  std::map<char, size_t> result;
63  for( auto const& seq : set ) {
64  for( auto const& s : seq ) {
65  ++result[s];
66  }
67  }
68  return result;
69 }
70 
71 // -------------------------------------------------------------------------
72 // Base Frequencies
73 // -------------------------------------------------------------------------
74 
78 std::map<char, double> base_frequencies_accumulator(
79  std::map<char, size_t> const& sitehistogram,
80  std::string const& plain_chars
81 ) {
82  // Prepare result (do it here to facilitate copy elision).
83  std::map<char, double> result;
84 
85  // Calculate sum of raw counts of all chars given in plain_chars.
86  size_t sum = 0;
87  for( auto const& shp : sitehistogram ) {
88  if( plain_chars.find( shp.first ) != std::string::npos ) {
89  sum += shp.second;
90  }
91  }
92 
93  // Make relative.
94  for( auto const& pc : plain_chars ) {
95  if( sitehistogram.count( pc )) {
96  result[pc] = static_cast<double>( sitehistogram.at(pc) ) / static_cast<double>( sum );
97  }
98  }
99  return result;
100 }
101 
102 std::map<char, double> base_frequencies( Sequence const& seq, std::string const& plain_chars )
103 {
104  auto const sh = site_histogram( seq );
105  return base_frequencies_accumulator( sh, plain_chars );
106 }
107 
108 std::map<char, double> base_frequencies( SequenceSet const& set, std::string const& plain_chars )
109 {
110  auto const sh = site_histogram( set );
111  return base_frequencies_accumulator( sh, plain_chars );
112 }
113 
114 // -------------------------------------------------------------------------
115 // Char counting and validation
116 // -------------------------------------------------------------------------
117 
118 size_t count_chars( SequenceSet const& set, std::string const& chars )
119 {
120  // Init array to false, then set all necessary chars to true.
121  auto lookup = utils::CharLookup<bool>( false );
122  lookup.set_selection_upper_lower( chars, true );
123 
124  size_t counter = 0;
125  for( auto& s : set ) {
126  for( auto& c : s ) {
127  // get rid of this check and leave it to the parser/lexer/stream iterator
128  if( c < 0 ) {
129  continue;
130  }
131  assert( c >= 0 );
132  if( lookup[ c ] ) {
133  ++counter;
134  }
135  }
136  }
137  return counter;
138 }
139 
140 // -------------------------------------------------------------------------
141 // Gap Counting
142 // -------------------------------------------------------------------------
143 
144 double gapyness( SequenceSet const& set, std::string const& gap_chars )
145 {
146  size_t gaps = count_chars( set, gap_chars );
147  size_t len = total_length( set );
148  if( len == 0 ) {
149  return 0.0;
150  }
151 
152  double ret = static_cast<double>( gaps ) / static_cast<double>( len );
153  assert( 0.0 <= ret && ret <= 1.0 );
154  return ret;
155 }
156 
157 size_t gap_site_count( SiteCounts const& counts )
158 {
159  size_t res = 0;
160  for( size_t site_idx = 0; site_idx < counts.length(); ++site_idx ) {
161  bool all_gap_site = true;
162 
163  for( size_t char_idx = 0; char_idx < counts.characters().size(); ++char_idx ) {
164  all_gap_site &= ( counts.count_at( char_idx, site_idx ) == 0 );
165  }
166 
167  if( all_gap_site ) {
168  ++res;
169  }
170  }
171  return res;
172 }
173 
174 } // namespace sequence
175 } // namespace genesis
Store counts of the occurence for certain characters at the sites of Sequences.
Definition: counts.hpp:85
std::map< char, double > base_frequencies_accumulator(std::map< char, size_t > const &sitehistogram, std::string const &plain_chars)
Local helper function that turns a site histogram into base frequencies.
double gapyness(SequenceSet const &set, std::string const &gap_chars)
Return the "gapyness" of the Sequences, i.e., the proportion of gap chars and other completely undete...
size_t gap_site_count(SiteCounts const &counts)
double sum(const Histogram &h)
std::map< char, double > base_frequencies(Sequence const &seq, std::string const &plain_chars)
Get the base frequencies of the sites in a Sequence given the base chars.
size_t count_chars(SequenceSet const &set, std::string const &chars)
Count the number of occurrences of the given chars within the sites of the SequenceSet.
size_t total_length(SequenceSet const &set)
Return the total length (sum) of all Sequences in the SequenceSet.
CountsIntType count_at(size_t character_index, size_t site_index) const
Return the count for a character and a site, given their indices.
Definition: counts.cpp:109
Store a set of Sequences.
size_t length() const
Return the number of sites used for counting.
Definition: counts.cpp:75
void set_selection_upper_lower(std::string const &chars, T value)
Set the lookup status for both the upper and lower case of all chars that are contained in a given st...
std::map< char, size_t > site_histogram(Sequence const &seq)
Get a histogram of the occurrences of particular sites, given a Sequence.
std::string characters() const
Return the character set that is used for counting.
Definition: counts.cpp:80