A library for working with phylogenetic and population genetic data.
v0.32.0
reference_genome.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_SEQUENCE_REFERENCE_GENOME_H_
2 #define GENESIS_SEQUENCE_REFERENCE_GENOME_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lczech@carnegiescience.edu>
23  Department of Plant Biology, Carnegie Institution For Science
24  260 Panama Street, Stanford, CA 94305, USA
25 */
26 
35 
39 
40 #include <cassert>
41 #include <limits>
42 #include <memory>
43 #include <mutex>
44 #include <stdexcept>
45 #include <string>
46 #include <unordered_map>
47 #include <utility>
48 #include <vector>
49 
50 namespace genesis {
51 namespace sequence {
52 
53 // =================================================================================================
54 // ReferenceGenome
55 // =================================================================================================
56 
66 {
67 public:
68 
69  // -------------------------------------------------------------------------
70  // Typedefs and Enums
71  // -------------------------------------------------------------------------
72 
73  using iterator = typename std::vector<Sequence>::iterator;
74  using const_iterator = typename std::vector<Sequence>::const_iterator;
75 
76  using reference = Sequence&;
77  using const_reference = Sequence const&;
78 
79  // -------------------------------------------------------------------------
80  // Constructors and Rule of Five
81  // -------------------------------------------------------------------------
82 
84  {
85  guard_ = genesis::utils::make_unique<std::mutex>();
86  }
87 
88  ~ReferenceGenome() = default;
89 
90  ReferenceGenome( ReferenceGenome const& ) = delete;
91  ReferenceGenome( ReferenceGenome&& ) = default;
92 
93  ReferenceGenome& operator= ( ReferenceGenome const& ) = delete;
95 
96  // -------------------------------------------------------------------------
97  // Accessors
98  // -------------------------------------------------------------------------
99 
103  size_t size() const
104  {
105  return sequences_.size();
106  }
107 
111  bool empty() const
112  {
113  return sequences_.empty();
114  }
115 
116  bool contains( std::string const& label ) const
117  {
118  return lookup_.count( label );
119  }
120 
125  const_iterator find( std::string const& label ) const
126  {
127  // Lock access to the cache. Released at the end of the scope.
128  assert( guard_ );
129  std::lock_guard<std::mutex> lock( *guard_ );
130 
131  // Try to get the sequence from the cache, for speed.
132  assert( cache_ == std::numeric_limits<size_t>::max() || cache_ < sequences_.size() );
133  if( cache_ != std::numeric_limits<size_t>::max() && sequences_[cache_].label() == label ) {
134  assert( cache_ < sequences_.size() );
135  return sequences_.begin() + cache_;
136  }
137 
138  // If not cached, do a normal lookup, and set the cache.
139  auto lit = lookup_.find( label );
140  if( lit == lookup_.end() ) {
141  return sequences_.end();
142  }
143  assert( lit->first == label );
144  assert( lit->second < sequences_.size() );
145  cache_ = lit->second;
146  return sequences_.begin() + lit->second;
147  }
148 
152  inline const_reference get( std::string const& label ) const
153  {
154  auto const it = find( label );
155  if( it == sequences_.end() ) {
156  throw std::runtime_error(
157  "Reference Genome does not contain requested sequence \"" + label + "\""
158  );
159  }
160  return *it;
161  }
162 
174  inline char get_base( std::string const& chromosome, size_t position, bool to_upper = true ) const
175  {
176  return get_base( find( chromosome ), position, to_upper );
177  }
178 
185  inline char get_base( const_iterator it, size_t position, bool to_upper = true ) const
186  {
187  // We assume that the given iterator is valid. Everything else is a user error.
188  assert( it != sequences_.end() );
189 
190  // Get the base at the given position.
191  auto const& ref_seq = *it;
192  if( position == 0 || position - 1 >= ref_seq.length() ) {
193  throw std::runtime_error(
194  "Reference Genome sequence \"" + ref_seq.label() + "\" is " +
195  std::to_string( ref_seq.length() ) +
196  " bases long, which is shorter than then requested (1-based) position " +
197  std::to_string( position ) + ". This likely means that some input data contains " +
198  "positions beyond the length of the reference genome, which is invalid."
199  );
200  }
201  assert( position - 1 < ref_seq.length() );
202  if( to_upper ) {
203  return utils::to_upper( ref_seq[ position - 1 ]);
204  }
205  return ref_seq[ position - 1 ];
206  }
207 
208  // -------------------------------------------------------------------------
209  // Modifiers
210  // -------------------------------------------------------------------------
211 
222  const_reference add( Sequence const& seq, bool also_look_up_first_word = true )
223  {
224  return add( Sequence{seq}, also_look_up_first_word );
225  }
226 
232  const_reference add( Sequence&& seq, bool also_look_up_first_word = true )
233  {
234  // Get and check the original form of the label.
235  auto const label1 = seq.label();
236  if( lookup_.count( label1 ) > 0 ) {
237  throw std::runtime_error(
238  "Reference Genome already contains sequence name \"" + label1 + "\", "
239  "which cannot be added again."
240  );
241  }
242  assert( lookup_.count( label1 ) == 0 );
243 
244  // Do the same for the first-word form as well. We always compute the label here,
245  // even if not used later, so that we can do the check before actually modifying our content.
246  // Slightly cleaner that way.
247  auto const label2 = utils::split( seq.label(), "\t " )[0];
248  if( also_look_up_first_word && lookup_.count( label2 ) > 0 ) {
249  throw std::runtime_error(
250  "Reference Genome already contains sequence name \"" + label2 + "\", "
251  "which is the shortened version of the original name \"" + label1 + "\"."
252  );
253  }
254 
255  // Lock access to the cache. Probably not needed here, as adding sequences from
256  // mutliple threads is unlikely, but doesn't hurt to have it here.
257  // Released at the end of the scope.
258  assert( guard_ );
259  std::lock_guard<std::mutex> lock( *guard_ );
260 
261  // Add the sequence to the list. We also need to reset the cache.
262  sequences_.push_back( std::move(seq) );
263  cache_ = std::numeric_limits<size_t>::max();
264 
265  // Also add the sequence name to the lookup. If we also add a first-word-only version of it,
266  // we might have cases where this is the same as the original (when the name does not
267  // contain any tabs or spaces), but that doesn't matter; we'd just add the same label
268  // twice (which would overwrite it in the map), pointing to the same sequence either way.
269  assert( sequences_.size() > 0 );
270  lookup_[ label1 ] = sequences_.size() - 1;
271  assert( lookup_.count( label1 ) > 0 );
272  if( also_look_up_first_word ) {
273  lookup_[ label2 ] = sequences_.size() - 1;
274  assert( lookup_.count( label2 ) > 0 );
275  }
276 
277  // Now return the sequence that was just added.
278  return sequences_.back();
279  }
280 
284  void clear()
285  {
286  assert( guard_ );
287  std::lock_guard<std::mutex> lock( *guard_ );
288  sequences_.clear();
289  lookup_.clear();
290  cache_ = std::numeric_limits<size_t>::max();
291  }
292 
293  // -------------------------------------------------------------------------
294  // Iterators
295  // -------------------------------------------------------------------------
296 
298  {
299  return sequences_.cbegin();
300  }
301 
303  {
304  return sequences_.cend();
305  }
306 
308  {
309  return sequences_.cbegin();
310  }
311 
313  {
314  return sequences_.cend();
315  }
316 
317  // -------------------------------------------------------------------------
318  // Data Members
319  // -------------------------------------------------------------------------
320 
321 private:
322 
323  // Keep the sequences, as well as a lookup from names to indices in the vector.
324  std::vector<Sequence> sequences_;
325  std::unordered_map<std::string, size_t> lookup_;
326 
327  // We keep a cache of the last sequence name that was requested,
328  // for speeding up lookups on the same chromosome, which is the most typical case.
329  // Needs to be mutexed, as otherwise multiple threads might clash when accessing the cache.
330  mutable size_t cache_;
331  mutable std::unique_ptr<std::mutex> guard_;
332 
333 };
334 
335 } // namespace sequence
336 } // namespace genesis
337 
338 #endif // include guard
genesis::sequence::ReferenceGenome::find
const_iterator find(std::string const &label) const
Return an iterator to the Sequence with the given label, or an iterator to end() if no Sequence with ...
Definition: reference_genome.hpp:125
genesis::sequence::ReferenceGenome::add
const_reference add(Sequence const &seq, bool also_look_up_first_word=true)
Add a Sequence to the ReferenceGenome by copying it, and return a const_reference to it.
Definition: reference_genome.hpp:222
genesis::sequence::ReferenceGenome::get_base
char get_base(std::string const &chromosome, size_t position, bool to_upper=true) const
Get a particular base at a given chromosome and position.
Definition: reference_genome.hpp:174
genesis::sequence::ReferenceGenome::add
const_reference add(Sequence &&seq, bool also_look_up_first_word=true)
Add a Sequence to the ReferenceGenome by moving it, and return a const_reference to it.
Definition: reference_genome.hpp:232
genesis::sequence::ReferenceGenome::~ReferenceGenome
~ReferenceGenome()=default
genesis::sequence::ReferenceGenome::iterator
typename std::vector< Sequence >::iterator iterator
Definition: reference_genome.hpp:73
genesis::sequence::ReferenceGenome::cbegin
const_iterator cbegin() const
Definition: reference_genome.hpp:307
genesis::sequence::ReferenceGenome::const_iterator
typename std::vector< Sequence >::const_iterator const_iterator
Definition: reference_genome.hpp:74
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
genesis::sequence::ReferenceGenome::const_reference
Sequence const & const_reference
Definition: reference_genome.hpp:77
std.hpp
Provides some valuable additions to STD.
genesis::utils::split
std::vector< std::string > split(std::string const &str, char delimiter, const bool trim_empty)
Spilt a string into parts, given a delimiter char.
Definition: string.cpp:575
genesis::sequence::ReferenceGenome::contains
bool contains(std::string const &label) const
Definition: reference_genome.hpp:116
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
genesis::utils::to_upper
constexpr char to_upper(char c) noexcept
Return the upper case version of a letter, ASCII-only.
Definition: char.hpp:230
string.hpp
Provides some commonly used string utility functions.
genesis::sequence::ReferenceGenome::operator=
ReferenceGenome & operator=(ReferenceGenome const &)=delete
genesis::sequence::ReferenceGenome::end
const_iterator end() const
Definition: reference_genome.hpp:302
genesis::sequence::ReferenceGenome::begin
const_iterator begin() const
Definition: reference_genome.hpp:297
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::ReferenceGenome::ReferenceGenome
ReferenceGenome()
Definition: reference_genome.hpp:83
char.hpp
genesis::sequence::ReferenceGenome::get_base
char get_base(const_iterator it, size_t position, bool to_upper=true) const
Get a particular base at the given sequence iterator and position.
Definition: reference_genome.hpp:185
genesis::sequence::ReferenceGenome::size
size_t size() const
Definition: reference_genome.hpp:103
genesis::sequence::ReferenceGenome::cend
const_iterator cend() const
Definition: reference_genome.hpp:312
genesis::sequence::ReferenceGenome::clear
void clear()
Remove all Sequences from the ReferenceGenome, leaving it with a size() of 0.
Definition: reference_genome.hpp:284
genesis::sequence::ReferenceGenome
Lookup of Sequences of a reference genome.
Definition: reference_genome.hpp:65
genesis::sequence::ReferenceGenome::empty
bool empty() const
Definition: reference_genome.hpp:111
genesis::sequence::ReferenceGenome::get
const_reference get(std::string const &label) const
Same as find(), but returns the sequence directly, or throws if not present.
Definition: reference_genome.hpp:152
sequence.hpp