A library for working with phylogenetic and population genetic data.
v0.32.0
sequence_dict.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_SEQUENCE_SEQUENCE_DICT_H_
2 #define GENESIS_SEQUENCE_SEQUENCE_DICT_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
35 
38 
39 #include <algorithm>
40 #include <cassert>
41 #include <stdexcept>
42 #include <string>
43 #include <unordered_map>
44 #include <vector>
45 
46 namespace genesis {
47 namespace sequence {
48 
64 {
65 public:
66 
67  // -------------------------------------------------------------------------
68  // Typedefs and Enums
69  // -------------------------------------------------------------------------
70 
71  struct Entry
72  {
73  std::string name;
74  size_t length = 0;
75 
76  // /**
77  // * @brief Alias to get the @p name of the Entry.
78  // */
79  // inline std::string const& name() const
80  // {
81  // return name;
82  // }
83 
87  inline std::string const& label() const
88  {
89  return name;
90  }
91 
92  // /**
93  // * @brief Alias to get the @p length of the Entry.
94  // */
95  // inline size_t length() const
96  // {
97  // return length;
98  // }
99 
103  inline size_t size() const
104  {
105  return length;
106  }
107  };
108 
109  using const_iterator = std::vector<Entry>::const_iterator;
110 
111  // -------------------------------------------------------------------------
112  // Constructors and Rule of Five
113  // -------------------------------------------------------------------------
114 
115  SequenceDict() = default;
116  ~SequenceDict() = default;
117 
118  SequenceDict( SequenceDict const& ) = default;
119  SequenceDict( SequenceDict&& ) = default;
120 
121  SequenceDict& operator= ( SequenceDict const& ) = default;
122  SequenceDict& operator= ( SequenceDict&& ) = default;
123 
124  // -------------------------------------------------------------------------
125  // Modifiers
126  // -------------------------------------------------------------------------
127 
133  void add( Sequence const& sequence, bool also_look_up_first_word = true )
134  {
135  add( sequence.label(), sequence.length(), also_look_up_first_word );
136  }
137 
143  void add( std::string const& name, size_t length, bool also_look_up_first_word = true )
144  {
145  Entry e;
146  e.name = name;
147  e.length = length;
148  add( std::move( e ), also_look_up_first_word );
149  }
150 
156  void add( Entry const& entry, bool also_look_up_first_word = true )
157  {
158  add( Entry{ entry }, also_look_up_first_word);
159  }
160 
171  void add( Entry&& entry, bool also_look_up_first_word = true )
172  {
173  // Check for duplicates. As we are using the unordered map for indicies anyway,
174  // we can just rely on that, and do not have to check the vector again.
175  if( indices_.count( entry.name ) > 0 ) {
176  throw std::runtime_error(
177  "Cannot add duplicate sequence name \"" + entry.name + "\" to SequenceDict."
178  );
179  }
180  assert( indices_.count( entry.name ) == 0 );
181 
182  // Do the same for the first-word form as well. We always compute the label here,
183  // even if not used later, so that we can do the check before actually modifying our content.
184  // Slightly cleaner that way.
185  auto const label2 = utils::split( entry.name, "\t " )[0];
186  if( also_look_up_first_word && indices_.count( label2 ) > 0 ) {
187  throw std::runtime_error(
188  "Cannot add duplicate sequence name \"" + label2 + "\" to SequenceDict, "
189  "which is the shortened version of the original name \"" + entry.name + "\"."
190  );
191  }
192 
193  // We first create the index map entry, before adding the entry to the dict.
194  // In that order, the index that we want to store in the map is the current size of the
195  // vector pre insertion. We do the same for the shortened name as well, if wanted.
196  indices_[ entry.name ] = entries_.size();
197  if( also_look_up_first_word ) {
198  indices_[ label2 ] = entries_.size();
199  }
200  entries_.push_back( entry );
201  assert( indices_.count( entry.name ) == 1 && indices_[ entry.name ] == entries_.size() - 1 );
202  assert(
203  ! also_look_up_first_word ||
204  ( indices_.count( label2 ) == 1 && indices_[ label2 ] == entries_.size() - 1 )
205  );
206  }
207 
208  void clear()
209  {
210  entries_.clear();
211  indices_.clear();
212  }
213 
214  // -------------------------------------------------------------------------
215  // Accessors
216  // -------------------------------------------------------------------------
217 
218  size_t size() const
219  {
220  return entries_.size();
221  }
222 
223  Entry const& operator [] (size_t index) const
224  {
225  return entries_[index];
226  }
227 
228  Entry const& at (size_t index) const
229  {
230  return entries_.at(index);
231  }
232 
233  Entry const& get( std::string const& name ) const
234  {
235  auto const idx = index_of( name );
236  assert( idx < entries_.size() );
237  return entries_[ idx ];
238  }
239 
240  size_t index_of( std::string const& name ) const
241  {
242  auto const it = indices_.find( name );
243  if( it == indices_.end() ) {
244  throw std::runtime_error(
245  "Sequence name \"" + name + "\" not found in SequenceDict."
246  );
247  }
248  assert( entries_[it->second].name == name );
249  return it->second;
250  }
251 
252  bool contains( std::string const& name ) const
253  {
254  return indices_.count( name ) > 0;
255  }
256 
257  const_iterator find( std::string const& name ) const
258  {
259  auto const it = indices_.find( name );
260  if( it == indices_.end() ) {
261  return entries_.end();
262  }
263  assert( entries_[it->second].name == name );
264  return entries_.begin() + it->second;
265  }
266 
267  // -------------------------------------------------------------------------
268  // Iterators
269  // -------------------------------------------------------------------------
270 
272  {
273  return entries_.cbegin();
274  }
275 
277  {
278  return entries_.cend();
279  }
280 
281  // -------------------------------------------------------------------------
282  // Data Members
283  // -------------------------------------------------------------------------
284 
285 private:
286 
287  // vector of entries, and a fast-ish lookup from names to the index in the vector.
288  std::vector<Entry> entries_;
289  std::unordered_map<std::string, size_t> indices_;
290 
291 };
292 
293 } // namespace sequence
294 } // namespace genesis
295 
296 #endif // include guard
genesis::sequence::SequenceDict::Entry::name
std::string name
Definition: sequence_dict.hpp:73
genesis::sequence::SequenceDict::add
void add(Sequence const &sequence, bool also_look_up_first_word=true)
Add a Sequence to the dictionary.
Definition: sequence_dict.hpp:133
genesis::sequence::SequenceDict::Entry::length
size_t length
Definition: sequence_dict.hpp:74
genesis::sequence::SequenceDict::contains
bool contains(std::string const &name) const
Definition: sequence_dict.hpp:252
genesis::sequence::SequenceDict
Store dictionary/index data on sequence files, such as coming from .fai or .dict files.
Definition: sequence_dict.hpp:63
genesis::sequence::SequenceDict::find
const_iterator find(std::string const &name) const
Definition: sequence_dict.hpp:257
genesis::sequence::SequenceDict::operator[]
Entry const & operator[](size_t index) const
Definition: sequence_dict.hpp:223
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
genesis::tree::length
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
Definition: tree/common_tree/functions.cpp:160
genesis::sequence::SequenceDict::~SequenceDict
~SequenceDict()=default
genesis::sequence::SequenceDict::const_iterator
std::vector< Entry >::const_iterator const_iterator
Definition: sequence_dict.hpp:109
genesis::sequence::SequenceDict::size
size_t size() const
Definition: sequence_dict.hpp:218
genesis::utils::split
std::vector< std::string > split(std::string const &str, char delimiter, const bool trim_empty)
Spilt a string into parts, given a delimiter char.
Definition: string.cpp:575
genesis::sequence::SequenceDict::Entry::size
size_t size() const
Alias to get the length of the Entry.
Definition: sequence_dict.hpp:103
string.hpp
Provides some commonly used string utility functions.
genesis::sequence::SequenceDict::SequenceDict
SequenceDict()=default
genesis::sequence::SequenceDict::begin
const_iterator begin() const
Definition: sequence_dict.hpp:271
genesis::sequence::Sequence::label
std::string & label()
Definition: sequence/sequence.hpp:90
genesis::sequence::SequenceDict::add
void add(Entry &&entry, bool also_look_up_first_word=true)
Add an entry to the dictionary.
Definition: sequence_dict.hpp:171
genesis::sequence::SequenceDict::clear
void clear()
Definition: sequence_dict.hpp:208
genesis::sequence::Sequence::length
size_t length() const
Return the length (number of sites) of this sequence.
Definition: sequence/sequence.hpp:167
genesis::sequence::SequenceDict::at
Entry const & at(size_t index) const
Definition: sequence_dict.hpp:228
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::SequenceDict::add
void add(std::string const &name, size_t length, bool also_look_up_first_word=true)
Add an entry to the dictionary, given its name and length.
Definition: sequence_dict.hpp:143
char.hpp
genesis::sequence::SequenceDict::end
const_iterator end() const
Definition: sequence_dict.hpp:276
genesis::sequence::SequenceDict::Entry::label
std::string const & label() const
Alias to get the name of the Entry.
Definition: sequence_dict.hpp:87
genesis::sequence::SequenceDict::operator=
SequenceDict & operator=(SequenceDict const &)=default
genesis::sequence::SequenceDict::add
void add(Entry const &entry, bool also_look_up_first_word=true)
Add an entry to the dictionary.
Definition: sequence_dict.hpp:156
genesis::sequence::SequenceDict::get
Entry const & get(std::string const &name) const
Definition: sequence_dict.hpp:233
genesis::sequence::SequenceDict::Entry
Definition: sequence_dict.hpp:71
genesis::sequence::SequenceDict::index_of
size_t index_of(std::string const &name) const
Definition: sequence_dict.hpp:240
sequence.hpp