A toolkit for working with phylogenetic data.
v0.18.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
counts.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2017 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 
38 #include <algorithm>
39 #include <assert.h>
40 #include <limits>
41 #include <stdexcept>
42 
43 namespace genesis {
44 namespace sequence {
45 
46 // ================================================================================================
47 // Constructors and Rule of Five
48 // ================================================================================================
49 
50 SequenceCounts::SequenceCounts( std::string const& characters, size_t length )
51  : num_seqs_( 0 )
52 {
53  // Uppercase, sort, uniq the characters.
54  characters_ = normalize_codes( characters );
55 
56  // Add characters to lookup table, set all other to a max value indicating that this char
57  // is not part of the table.
58  lookup_.set_all( characters_.size() );
59  for( size_t i = 0; i < characters_.size(); ++i ) {
60  auto c = characters_[i];
61  lookup_.set_char_upper_lower( c, i );
62  }
63 
64  // Use number of chars and number of sites to init data matrix.
65  // We store the chars per row, and the sites per column (e.g., one column represents the 'A's
66  // for all sites, while one row represents the possible chars for one site).
67  // This way, we use cache locality when filling in data.
68  counts_ = utils::Matrix< CountsIntType >( length, characters_.size() );
69 }
70 
71 // ================================================================================================
72 // Accesors
73 // ================================================================================================
74 
75 size_t SequenceCounts::length() const
76 {
77  return counts_.rows();
78 }
79 
80 std::string SequenceCounts::characters() const
81 {
82  assert( counts_.cols() == characters_.size() );
83  return characters_;
84 }
85 
87 {
88  return num_seqs_;
89 }
90 
91 SequenceCounts::CountsIntType SequenceCounts::count_of( char character, size_t site_index ) const
92 {
93  if( site_index >= length() ) {
94  throw std::runtime_error(
95  "Invalid site index for retrieving count: " + std::to_string( site_index ) + "."
96  );
97  }
98 
99  auto char_idx = lookup_[ character ];
100  if( char_idx == characters_.size() ) {
101  throw std::runtime_error(
102  "Invalid character for retrieving count: '" + std::string( 1, character ) + "'."
103  );
104  }
105 
106  return counts_( site_index, char_idx );
107 }
108 
110  size_t character_index,
111  size_t site_index
112 ) const {
113  if( site_index >= counts_.rows() ) {
114  throw std::runtime_error(
115  "Invalid site index for retrieving count: " + std::to_string( site_index ) + "."
116  );
117  }
118  if( character_index > counts_.cols() ) {
119  throw std::runtime_error(
120  "Invalid character index for retrieving count: " + std::to_string( character_index ) + "."
121  );
122  }
123 
124  return counts_( site_index, character_index );
125 }
126 
127 // ================================================================================================
128 // Modifiers
129 // ================================================================================================
130 
131 void SequenceCounts::add_sequence( Sequence const& sequence )
132 {
133  add_sequence( sequence.sites() );
134 }
135 
136 void SequenceCounts::add_sequence( std::string const& sites )
137 {
138  if( num_seqs_ == std::numeric_limits< CountsIntType >::max() ) {
139  throw std::runtime_error(
140  "Cannot add Sequence to SequenceCounts as it might lead to an overflow in the counts."
141  );
142  }
143  if( sites.size() != counts_.rows() ) {
144  throw std::runtime_error(
145  "Cannot add Sequence to SequenceCounts if it has different number of sites: Expected "
146  + std::to_string( counts_.rows() ) + " sites, but sequence has"
147  + std::to_string( sites.size() ) + " sites."
148  );
149  }
150 
151  for( size_t site_idx = 0; site_idx < sites.size(); ++site_idx ) {
152  // Get the index of the char. If not found, this char is not to be counted, so continue.
153  auto char_idx = lookup_[ static_cast< size_t >( sites[ site_idx ] ) ];
154  if( char_idx == characters_.size() ) {
155  continue;
156  }
157 
158  // Increase the count at that index.
159  ++counts_( site_idx, char_idx );
160  }
161 
162  // We finished a sequence. Add to the counter.
163  ++num_seqs_;
164 }
165 
167 {
168  for( auto const& seq : sequences ) {
169  add_sequence( seq );
170  }
171 }
172 
174 {
175  characters_ = "";
176  lookup_.set_all( 0 );
177  counts_ = utils::Matrix< CountsIntType >();
178  num_seqs_ = 0;
179 }
180 
182 {
183  for( auto& e : counts_ ) {
184  e = 0;
185  }
186  num_seqs_ = 0;
187 }
188 
189 } // namespace sequence
190 } // namespace genesis
size_t cols() const
Definition: matrix.hpp:156
CountsIntType added_sequences_count() const
Return the number of processed Sequences, i.e., how many Sequences were added in total.
Definition: counts.cpp:86
void add_sequence(Sequence const &sequence)
Process a single Sequence and add its counts to the existing ones.
Definition: counts.cpp:131
void set_all(T value)
Set the lookup status for all chars at once.
CountsIntType count_of(char character, size_t site_index) const
Return the count of a specific character at a given site.
Definition: counts.cpp:91
std::string const & sites() const
Definition: sequence.cpp:72
void clear()
Clear the object, that is, delete everything.
Definition: counts.cpp:173
uint32_t CountsIntType
Type of uint used for internally counting the freuqencies of Sequence sites.
Definition: counts.hpp:100
size_t length() const
Return the number of sites used for counting.
Definition: counts.cpp:75
void add_sequences(SequenceSet const &sequences)
Process a SequenceSet and add its counts to the existing ones for all contained Sequences.
Definition: counts.cpp:166
std::string to_string(T const &v)
Return a string representation of a given value.
Definition: string.hpp:300
void clear_counts()
Reset all counts to 0.
Definition: counts.cpp:181
std::string normalize_codes(std::string const &alphabet)
Normalize a set of Sequence codes, i.e., make them upper case, sort them, and remove duplicates...
Definition: codes.cpp:345
SequenceCounts()=default
Default constructor.
Provides some commonly used string utility functions.
Store a set of Sequences.
size_t rows() const
Definition: matrix.hpp:151
void set_char_upper_lower(char c, T value)
Set the lookup status for both the upper and lower case of a given char.
std::string characters() const
Return the character set that is used for counting.
Definition: counts.cpp:80
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
CountsIntType count_at(size_t character_index, size_t site_index) const
Return the count for a character and a site, given their indices.
Definition: counts.cpp:109