A library for working with phylogenetic and population genetic data.
v0.32.0
dict.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
37 
38 #include <cassert>
39 #include <stdexcept>
40 #include <string>
41 #include <unordered_set>
42 #include <utility>
43 
44 namespace genesis {
45 namespace sequence {
46 
47 // =================================================================================================
48 // Sequence Dict
49 // =================================================================================================
50 
51 SequenceDict read_sequence_dict( std::shared_ptr< utils::BaseInputSource > source )
52 {
53  // Prepare the result and the input stream.
54  SequenceDict result;
55  utils::InputStream it( source );
56 
57  // Read lines while there is data.
58  // We don't need to be super efficient here, dict files typically only contain a few dozen lines.
59  size_t line_cnt = 0;
60  while( it ) {
61  ++line_cnt;
62 
63  // Get the line and split it on tabs; do some basic format sanity checks.
64  auto const line = utils::split( it.get_line(), "\t" );
65  if( line.size() == 0 ) {
66  continue;
67  }
68  if( line[0].length() != 3 || line[0][0] != '@' ) {
69  throw std::runtime_error(
70  "Invalid sequence dict file: Line " + std::to_string( line_cnt ) +
71  " does not start with a header record type code '@XX', but with '" + line[0] + "'."
72  );
73  }
74 
75  // Now we know that we have a valid line. Let's see if it's one that we are intersted in.
76  assert( line.size() > 0 );
77  assert( line[0].size() == 3 );
78  assert( line[0][0] == '@' );
79  if( line[0] != "@SQ" ) {
80  continue;
81  }
82 
83  // Now we can go through the fields of the line, and find the TAG:VALUE pairs that we want.
84  std::string sn;
85  size_t ln = 0;
86  for( size_t i = 1; i < line.size(); ++i ) {
87  auto const& field = line[i];
88  if( field.size() < 3 || field[2] != ':' ) {
89  throw std::runtime_error(
90  "Invalid sequence dict file: Line " + std::to_string( line_cnt ) +
91  " contains an @SQ record that is not of the form 'TAG:VALUE', "
92  "but '" + field + "'."
93  );
94  }
95  assert( field.size() >=3 );
96  assert( field[2] == ':' );
97 
98  // Get the two fields we are itnerested in.
99  if( utils::starts_with( field, "SN:" )) {
100  sn = field.substr( 3 );
101  }
102  if( utils::starts_with( field, "LN:" )) {
103  if( ! utils::is_convertible_to_unsigned_integer( field.substr( 3 ) )) {
104  throw std::runtime_error(
105  "Invalid sequence dict file: Line " + std::to_string( line_cnt ) +
106  " contains an @SQ record with a field for the sequence length LN"
107  " whose VALUE is not a number, but '" + field.substr( 3 ) + "'."
108  );
109  }
110  ln = utils::convert_to_unsigned_integer( field.substr( 3 ));
111  }
112  }
113 
114  // We are a bit pedantic here, and throw in case of empty results.
115  // Technically, those could actually occur in the data, but that would be some weird edge
116  // case anyway, that would lead to other problems downstream.
117  if( sn.empty() || ln == 0 ) {
118  throw std::runtime_error(
119  "Invalid sequence dict file: Line " + std::to_string( line_cnt ) +
120  " contains an @SQ record with no valid SN or LN fields."
121  );
122  }
123 
124  // Now we are good; add to the list.
125  result.add( sn, ln );
126  }
127 
128  return result;
129 }
130 
131 SequenceDict read_sequence_fai( std::shared_ptr<utils::BaseInputSource> source )
132 {
133  // Prepare the result and the input stream.
134  SequenceDict result;
135  utils::InputStream it( source );
136 
137  // Read lines while there is data.
138  // We don't need to be super efficient here, fai files typically only contain a few dozen lines.
139  size_t line_cnt = 0;
140  while( it ) {
141  ++line_cnt;
142 
143  // Get the line and split it on tabs; do some basic format sanity checks.
144  auto const line = utils::split( it.get_line(), "\t" );
145  if( line.size() == 0 ) {
146  continue;
147  }
148  if( line.size() != 5 ) {
149  throw std::runtime_error(
150  "Invalid sequence fai file: Line " + std::to_string( line_cnt ) +
151  " has " + std::to_string( line.size() ) + " columns instead of the expected 5 columns."
152  );
153  }
154 
155  // Now we know that we have a valid line. Let's see if it's one that we are intersted in.
156  assert( line.size() == 5 );
157  std::string const sn = line[0];
159  throw std::runtime_error(
160  "Invalid sequence fai file: Line " + std::to_string( line_cnt ) +
161  " contains a record with a LENGTH field that is not a number, but '" + line[1] + "'."
162  );
163  }
164  size_t const ln = utils::convert_to_unsigned_integer( line[1] );
165 
166  // We are a bit pedantic here, and throw in case of empty results.
167  // Technically, those could actually occur in the data, but that would be some weird edge
168  // case anyway, that would lead to other problems downstream.
169  if( sn.empty() || ln == 0 ) {
170  throw std::runtime_error(
171  "Invalid sequence fai file: Line " + std::to_string( line_cnt ) +
172  " contains a record with invalid NAME or LENGTH fields."
173  );
174  }
175 
176  // Now we are good; add to the list.
177  result.add( sn, ln );
178  }
179 
180  return result;
181 }
182 
183 template<class T>
185 {
186  SequenceDict result;
187  for( auto const& elem : input ) {
188  result.add( elem.label(), elem.length() );
189  }
190  return result;
191 }
192 
194 {
195  return sequence_iterable_to_dict_( set );
196 }
197 
199 {
200  return sequence_iterable_to_dict_( rg );
201 }
202 
203 // =================================================================================================
204 // Reference
205 // =================================================================================================
206 
208  SequenceDict const& lhs,
209  SequenceDict const& rhs,
211 ) {
212  // Basic check for forbidden edge case of empty strings first.
213  auto check_dict_empty_names_ = []( SequenceDict const& dict ) {
214  for( auto const& e : dict ) {
215  if( e.name.empty() ) {
216  throw std::invalid_argument( "Invalid reference sequences with empty names." );
217  }
218  }
219  };
220  check_dict_empty_names_( lhs );
221  check_dict_empty_names_( rhs );
222 
223  // We have a few different combinations to test. Let's see if that works.
224  switch( mode ) {
226  // Strict: Compar the sizes of both sets. Then, we only need to iterate one of them,
227  // as this then needs to exactly match every entry in the other.
228  if( lhs.size() != rhs.size() ) {
229  return false;
230  }
231  for( auto const& e : lhs ) {
232  if( ! rhs.contains( e.name ) || e.length != rhs.get( e.name ).length ) {
233  return false;
234  }
235  }
236  break;
237  }
239  // Left superset is equivalent to right subset, so we only need to go through everything
240  // that the rhs has and compare those.
241  for( auto const& e : rhs ) {
242  if( ! lhs.contains( e.name ) || e.length != lhs.get( e.name ).length ) {
243  return false;
244  }
245  }
246  break;
247  }
249  // Same the other way round. Bit of code duplication, but okay for now.
250  for( auto const& e : lhs ) {
251  if( ! rhs.contains( e.name ) || e.length != rhs.get( e.name ).length ) {
252  return false;
253  }
254  }
255  break;
256  }
258  // Shared only: We ignore the ones that are not in either.
259  for( auto const& e : lhs ) {
260  if( ! rhs.contains( e.name )) {
261  continue;
262  }
263  if( e.length != rhs.get( e.name ).length ) {
264  return false;
265  }
266  }
267  break;
268  }
269  default: {
270  throw std::invalid_argument(
271  "Invalid ReferenceComparisonMode in compatible_references()"
272  );
273  }
274  }
275 
276  // Finally, we have exhausted all false cases above, so we can return that they are compatible.
277  return true;
278 }
279 
280 bool verify( SequenceDict const& dict, SequenceSet const& set, bool match_first_word )
281 {
282  if( dict.size() != set.size() ) {
283  return false;
284  }
285  for( size_t i = 0; i < dict.size(); ++i ) {
286  if( dict[i].name.empty() || set[i].label().empty() ) {
287  return false;
288  }
289  if( match_first_word ) {
290  auto const s_dct = utils::split( dict[i].name, "\t " );
291  auto const s_set = utils::split( set[i].label(), "\t " );
292  assert( ! s_dct.empty() );
293  assert( ! s_set.empty() );
294  if( s_dct[0] != s_set[0] ) {
295  return false;
296  }
297  } else if( dict[i].name != set[i].label() || dict[i].length != set[i].length() ) {
298  return false;
299  }
300  }
301  return true;
302 }
303 
304 } // namespace sequence
305 } // namespace genesis
genesis::utils::InputStream
Stream interface for reading data from an InputSource, that keeps track of line and column counters.
Definition: input_stream.hpp:88
genesis::sequence::sequence_iterable_to_dict_
SequenceDict sequence_iterable_to_dict_(T const &input)
Definition: dict.cpp:184
genesis::sequence::SequenceDict::add
void add(Sequence const &sequence, bool also_look_up_first_word=true)
Add a Sequence to the dictionary.
Definition: sequence_dict.hpp:133
genesis::sequence::SequenceDict::Entry::length
size_t length
Definition: sequence_dict.hpp:74
genesis::sequence::ReferenceComparisonMode::kSharedOnly
@ kSharedOnly
Either reference set can contain sequences that are not in the other. Only the shared ones are used f...
genesis::sequence::SequenceDict::contains
bool contains(std::string const &name) const
Definition: sequence_dict.hpp:252
genesis::sequence::SequenceDict
Store dictionary/index data on sequence files, such as coming from .fai or .dict files.
Definition: sequence_dict.hpp:63
genesis::sequence::ReferenceComparisonMode::kLeftSuperset
@ kLeftSuperset
The left hand reference set is allowed to contain sequences that are not in the right hand side....
genesis::utils::is_convertible_to_unsigned_integer
bool is_convertible_to_unsigned_integer(std::string const &str)
Return whether a string can be converted to unsigned integer.
Definition: convert.cpp:236
genesis::tree::length
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
Definition: tree/common_tree/functions.cpp:160
genesis::sequence::verify
bool verify(SequenceDict const &dict, SequenceSet const &set, bool match_first_word)
Verify that a SequenceDict fits a SequenceSet.
Definition: dict.cpp:280
genesis::sequence::sequence_set_to_dict
SequenceDict sequence_set_to_dict(SequenceSet const &set)
Get the sequence dict/index information of a given set of Sequences.
Definition: dict.cpp:193
genesis::utils::starts_with
bool starts_with(std::string const &text, std::string const &prefix)
Return whether a string starts with another string, i.e., check for a prefix.
Definition: string.cpp:136
genesis::sequence::SequenceDict::size
size_t size() const
Definition: sequence_dict.hpp:218
genesis::sequence::reference_genome_to_dict
SequenceDict reference_genome_to_dict(ReferenceGenome const &rg)
Get the sequence dict/index information of a given set of Sequences that are stored in a ReferenceGen...
Definition: dict.cpp:198
genesis::sequence::SequenceSet::size
size_t size() const
Definition: sequence_set.hpp:93
genesis::utils::split
std::vector< std::string > split(std::string const &str, char delimiter, const bool trim_empty)
Spilt a string into parts, given a delimiter char.
Definition: string.cpp:575
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
string.hpp
Provides some commonly used string utility functions.
genesis::utils::convert_to_unsigned_integer
bool convert_to_unsigned_integer(std::string const &str, unsigned long long &result)
Convert a string to unsigned integer, store the result in result, and return whether the conversion a...
Definition: convert.cpp:214
input_stream.hpp
genesis::sequence::compatible_references
bool compatible_references(SequenceDict const &lhs, SequenceDict const &rhs, ReferenceComparisonMode mode)
Verify that a SequenceDict fits a SequenceSet.
Definition: dict.cpp:207
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::InputStream::get_line
void get_line(std::string &target)
Read the current line, append it to the target, and move to the beginning of the next line.
Definition: input_stream.cpp:127
genesis::sequence::SequenceSet
Store a set of Sequences.
Definition: sequence_set.hpp:53
genesis::sequence::ReferenceComparisonMode
ReferenceComparisonMode
Chose how to deal with sub-/super-sets when comparing references.
Definition: dict.hpp:114
convert.hpp
genesis::sequence::ReferenceComparisonMode::kStrict
@ kStrict
Both compared reference sets have to contain the exact same sequence names.
char.hpp
genesis::sequence::read_sequence_dict
SequenceDict read_sequence_dict(std::shared_ptr< utils::BaseInputSource > source)
Read a .dict sequence dictionary file, describing, e.g., reference genome sequence properties.
Definition: dict.cpp:51
genesis::sequence::SequenceSet::empty
bool empty() const
Definition: sequence_set.hpp:101
dict.hpp
genesis::sequence::SequenceDict::get
Entry const & get(std::string const &name) const
Definition: sequence_dict.hpp:233
genesis::sequence::read_sequence_fai
SequenceDict read_sequence_fai(std::shared_ptr< utils::BaseInputSource > source)
Read a .fai sequence index file, describing, e.g., reference genome sequence properties.
Definition: dict.cpp:131
genesis::sequence::ReferenceGenome
Lookup of Sequences of a reference genome.
Definition: reference_genome.hpp:65
genesis::sequence::ReferenceComparisonMode::kRightSuperset
@ kRightSuperset
The right hand reference set is allowed to contain sequences that are not in the lefthand side....