A library for working with phylogenetic and population genetic data.
v0.32.0
counts.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
37 
38 #include <algorithm>
39 #include <cassert>
40 #include <limits>
41 #include <stdexcept>
42 
43 namespace genesis {
44 namespace sequence {
45 
46 // ================================================================================================
47 // Constructors and Rule of Five
48 // ================================================================================================
49 
50 SiteCounts::SiteCounts( std::string const& characters, size_t length )
51  : num_seqs_( 0 )
52 {
53  // Uppercase, sort, uniq the characters.
54  characters_ = normalize_code_alphabet( characters );
55 
56  // Add characters to lookup table, set all other to a max value indicating that this char
57  // is not part of the table.
58  lookup_.set_all( static_cast<unsigned char>( characters_.size() ));
59  for( size_t i = 0; i < characters_.size(); ++i ) {
60  auto c = characters_[i];
61  lookup_.set_char_upper_lower( c, static_cast<unsigned char>(i) );
62  }
63 
64  // Use number of chars and number of sites to init data matrix.
65  // We store the chars per row, and the sites per column (e.g., one column represents the 'A's
66  // for all sites, while one row represents the possible chars for one site).
67  // This way, we use cache locality when filling in data.
68  counts_ = utils::Matrix< CountsIntType >( length, characters_.size() );
69 }
70 
71 // ================================================================================================
72 // Accesors
73 // ================================================================================================
74 
75 size_t SiteCounts::length() const
76 {
77  return counts_.rows();
78 }
79 
80 std::string SiteCounts::characters() const
81 {
82  assert( counts_.cols() == characters_.size() );
83  return characters_;
84 }
85 
87 {
88  return num_seqs_;
89 }
90 
91 SiteCounts::CountsIntType SiteCounts::count_of( char character, size_t site_index ) const
92 {
93  if( site_index >= length() ) {
94  throw std::runtime_error(
95  "Invalid site index for retrieving count: " + std::to_string( site_index ) + "."
96  );
97  }
98 
99  auto char_idx = lookup_[ character ];
100  if( char_idx == characters_.size() ) {
101  throw std::runtime_error(
102  "Invalid character for retrieving count: '" + std::string( 1, character ) + "'."
103  );
104  }
105 
106  return counts_( site_index, char_idx );
107 }
108 
110  size_t character_index,
111  size_t site_index
112 ) const {
113  if( site_index >= counts_.rows() ) {
114  throw std::runtime_error(
115  "Invalid site index for retrieving count: " + std::to_string( site_index ) + "."
116  );
117  }
118  if( character_index > counts_.cols() ) {
119  throw std::runtime_error(
120  "Invalid character index for retrieving count: " + std::to_string( character_index ) + "."
121  );
122  }
123 
124  return counts_( site_index, character_index );
125 }
126 
127 // ================================================================================================
128 // Modifiers
129 // ================================================================================================
130 
131 void SiteCounts::add_sequence( Sequence const& sequence, bool use_abundance )
132 {
133  if( use_abundance ) {
134  add_sequence( sequence.sites(), static_cast<CountsIntType>( sequence.abundance() ));
135  } else {
136  add_sequence( sequence.sites(), 1 );
137  }
138 }
139 
140 void SiteCounts::add_sequence( std::string const& sites, CountsIntType weight )
141 {
142  if( num_seqs_ >= std::numeric_limits< CountsIntType >::max() - weight ) {
143  throw std::runtime_error(
144  "Cannot add Sequence to SiteCounts as it might lead to an overflow in the counts."
145  );
146  }
147  if( sites.size() != counts_.rows() ) {
148  throw std::runtime_error(
149  "Cannot add Sequence to SiteCounts if it has different number of sites: Expected "
150  + std::to_string( counts_.rows() ) + " sites, but sequence has "
151  + std::to_string( sites.size() ) + " sites."
152  );
153  }
154 
155  for( size_t site_idx = 0; site_idx < sites.size(); ++site_idx ) {
156  // Get the index of the char. If not found, this char is not to be counted, so continue.
157  auto char_idx = lookup_[ static_cast< size_t >( sites[ site_idx ] ) ];
158  if( char_idx == characters_.size() ) {
159  continue;
160  }
161 
162  // Increase the count at that index.
163  counts_( site_idx, char_idx ) += weight;
164  }
165 
166  // We finished a sequence. Add to the counter.
167  num_seqs_ += weight;
168 }
169 
170 void SiteCounts::add_sequences( SequenceSet const& sequences, bool use_abundances )
171 {
172  for( auto const& seq : sequences ) {
173  add_sequence( seq, use_abundances );
174  }
175 }
176 
178 {
179  characters_ = "";
180  lookup_.set_all( 0 );
181  counts_ = utils::Matrix< CountsIntType >();
182  num_seqs_ = 0;
183 }
184 
186 {
187  for( auto& e : counts_ ) {
188  e = 0;
189  }
190  num_seqs_ = 0;
191 }
192 
193 } // namespace sequence
194 } // namespace genesis
genesis::sequence::SiteCounts::SiteCounts
SiteCounts()=default
Default constructor.
genesis::utils::Matrix::cols
size_t cols() const
Definition: containers/matrix.hpp:181
genesis::sequence::SiteCounts::added_sequences_count
CountsIntType added_sequences_count() const
Return the number of processed Sequences, i.e., how many Sequences were added in total.
Definition: counts.cpp:86
genesis::sequence::SiteCounts::clear
void clear()
Clear the object, that is, delete everything.
Definition: counts.cpp:177
genesis::sequence::SiteCounts::count_of
CountsIntType count_of(char character, size_t site_index) const
Return the count of a specific character at a given site.
Definition: counts.cpp:91
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
counts.hpp
genesis::sequence::SiteCounts::add_sequences
void add_sequences(SequenceSet const &sequences, bool use_abundances=true)
Process a SequenceSet and add its counts to the existing ones for all contained Sequences.
Definition: counts.cpp:170
genesis::tree::length
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
Definition: tree/common_tree/functions.cpp:160
sequence_set.hpp
genesis::sequence::SiteCounts::add_sequence
void add_sequence(Sequence const &sequence, bool use_abundance=true)
Process a single Sequence and add its counts to the existing ones.
Definition: counts.cpp:131
genesis::utils::Matrix< CountsIntType >
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
string.hpp
Provides some commonly used string utility functions.
genesis::sequence::normalize_code_alphabet
std::string normalize_code_alphabet(std::string const &alphabet)
Normalize an alphabet set of Sequence codes, i.e., make them upper case, sort them,...
Definition: codes.cpp:359
genesis::sequence::Sequence::abundance
size_t abundance() const
Definition: sequence/sequence.hpp:150
genesis::sequence::SiteCounts::length
size_t length() const
Return the number of sites used for counting.
Definition: counts.cpp:75
genesis::sequence::SiteCounts::clear_counts
void clear_counts()
Reset all counts to 0.
Definition: counts.cpp:185
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::SequenceSet
Store a set of Sequences.
Definition: sequence_set.hpp:53
genesis::sequence::Sequence::sites
std::string & sites()
Definition: sequence/sequence.hpp:110
genesis::sequence::SiteCounts::CountsIntType
uint32_t CountsIntType
Type of uint used for internally counting the freuqencies of Sequence sites.
Definition: counts.hpp:101
genesis::utils::CharLookup::set_char_upper_lower
void set_char_upper_lower(char c, T value)
Set the lookup status for both the upper and lower case of a given char.
Definition: char_lookup.hpp:135
genesis::utils::CharLookup::set_all
void set_all(T value)
Set the lookup status for all chars at once.
Definition: char_lookup.hpp:192
genesis::utils::Matrix::rows
size_t rows() const
Definition: containers/matrix.hpp:176
genesis::sequence::SiteCounts::count_at
CountsIntType count_at(size_t character_index, size_t site_index) const
Return the count for a character and a site, given their indices.
Definition: counts.cpp:109
genesis::sequence::SiteCounts::characters
std::string characters() const
Return the character set that is used for counting.
Definition: counts.cpp:80
sequence.hpp
codes.hpp