A library for working with phylogenetic and population genetic data.
v0.32.0
function/genome_locus.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_POPULATION_FUNCTION_GENOME_LOCUS_H_
2 #define GENESIS_POPULATION_FUNCTION_GENOME_LOCUS_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@sund.ku.dk>
23  University of Copenhagen, Globe Institute, Section for GeoGenetics
24  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
25 */
26 
37 
38 #include <cassert>
39 #include <cstring>
40 #include <iostream>
41 #include <memory>
42 #include <string>
43 #include <stdexcept>
44 
45 namespace genesis {
46 namespace population {
47 
48 // =================================================================================================
49 // Output
50 // =================================================================================================
51 
52 inline std::string to_string( GenomeLocus const& locus )
53 {
54  // Error case.
55  if( locus.chromosome.empty() ) {
56  throw std::invalid_argument( "Invalid GenomeLocus with empty chromosome." );
57  }
58 
59  // Special case.
60  if( locus.position == 0 ) {
61  return locus.chromosome;
62  }
63 
64  // General case.
65  return locus.chromosome + ":" + std::to_string( locus.position );
66 }
67 
68 inline std::ostream& operator << ( std::ostream& os, GenomeLocus const& locus )
69 {
70  os << to_string( locus );
71  return os;
72 }
73 
74 // =================================================================================================
75 // Comparion Operators
76 // =================================================================================================
77 
78 // Really good example of the claim that C++ needs a lot of boilerplate code...
79 // We here provide all comparison operators for GenomeLocus, but also comparing loci given
80 // as chromosome (`std::string`) and position (`size_t`) separately,
81 // and then also overload all of those for using references and shared pointers to a SequenceDict.
82 //
83 // Previously (before also offering overloads that take SequenceDict), we just spelled out all
84 // overloads manually, but that became too cumbersome with the extra overloads. So now we use a
85 // macro that defines all of them for us. It is not super nice for the doxygen code documentation,
86 // as the doxygen preprocessor does not fully seem to be able to expand the macro. Hence, we cannot
87 // use copydoc to refer to specific functions, and have re-written the doc blocks of the main
88 // functions (the ones that the macro-expanded functions call) so that they work for all overloads.
89 
90 #define ADD_LOCUS_COMPARISON_OVERLOADS( CMP_FUNC ) \
91  \
92  inline int CMP_FUNC( \
93  GenomeLocus const& l, \
94  std::string const& r_chromosome, size_t r_position \
95  ) { \
96  return CMP_FUNC( l.chromosome, l.position, r_chromosome, r_position ); \
97  } \
98  \
99  \
100  inline int CMP_FUNC( \
101  std::string const& l_chromosome, size_t l_position, \
102  GenomeLocus const& r \
103  ) { \
104  return CMP_FUNC( l_chromosome, l_position, r.chromosome, r.position ); \
105  } \
106  \
107  \
108  inline int CMP_FUNC( \
109  GenomeLocus const& l, \
110  GenomeLocus const& r \
111  ) { \
112  return CMP_FUNC( l.chromosome, l.position, r.chromosome, r.position ); \
113  } \
114  \
115  \
116  inline int CMP_FUNC( \
117  GenomeLocus const& l, \
118  std::string const& r_chromosome, size_t r_position, \
119  genesis::sequence::SequenceDict const& sequence_dict \
120  ) { \
121  return CMP_FUNC( l.chromosome, l.position, r_chromosome, r_position, sequence_dict ); \
122  } \
123  \
124  \
125  inline int CMP_FUNC( \
126  std::string const& l_chromosome, size_t l_position, \
127  GenomeLocus const& r, \
128  genesis::sequence::SequenceDict const& sequence_dict \
129  ) { \
130  return CMP_FUNC( l_chromosome, l_position, r.chromosome, r.position, sequence_dict ); \
131  } \
132  \
133  \
134  inline int CMP_FUNC( \
135  GenomeLocus const& l, \
136  GenomeLocus const& r, \
137  genesis::sequence::SequenceDict const& sequence_dict \
138  ) { \
139  return CMP_FUNC( l.chromosome, l.position, r.chromosome, r.position, sequence_dict ); \
140  } \
141  \
142  \
143  inline int CMP_FUNC( \
144  std::string const& l_chromosome, size_t l_position, \
145  std::string const& r_chromosome, size_t r_position, \
146  std::shared_ptr<genesis::sequence::SequenceDict> const& sequence_dict \
147  ) { \
148  if( sequence_dict ) { \
149  return CMP_FUNC( l_chromosome, l_position, r_chromosome, r_position, *sequence_dict ); \
150  } else { \
151  return CMP_FUNC( l_chromosome, l_position, r_chromosome, r_position ); \
152  } \
153  } \
154  \
155  \
156  inline int CMP_FUNC( \
157  GenomeLocus const& l, \
158  std::string const& r_chromosome, size_t r_position, \
159  std::shared_ptr<genesis::sequence::SequenceDict> const& sequence_dict \
160  ) { \
161  if( sequence_dict ) { \
162  return CMP_FUNC( l.chromosome, l.position, r_chromosome, r_position, *sequence_dict ); \
163  } else { \
164  return CMP_FUNC( l.chromosome, l.position, r_chromosome, r_position ); \
165  } \
166  } \
167  \
168  \
169  inline int CMP_FUNC( \
170  std::string const& l_chromosome, size_t l_position, \
171  GenomeLocus const& r, \
172  std::shared_ptr<genesis::sequence::SequenceDict> const& sequence_dict \
173  ) { \
174  if( sequence_dict ) { \
175  return CMP_FUNC( l_chromosome, l_position, r.chromosome, r.position, *sequence_dict ); \
176  } else { \
177  return CMP_FUNC( l_chromosome, l_position, r.chromosome, r.position ); \
178  } \
179  } \
180  \
181  \
182  inline int CMP_FUNC( \
183  GenomeLocus const& l, \
184  GenomeLocus const& r, \
185  std::shared_ptr<genesis::sequence::SequenceDict> const& sequence_dict \
186  ) { \
187  if( sequence_dict ) { \
188  return CMP_FUNC( l.chromosome, l.position, r.chromosome, r.position, *sequence_dict ); \
189  } else { \
190  return CMP_FUNC( l.chromosome, l.position, r.chromosome, r.position ); \
191  } \
192  }
193 
194 // -------------------------------------------------------------------------
195 // Spaceship <=>
196 // -------------------------------------------------------------------------
197 
232 inline int locus_compare(
233  std::string const& l_chromosome, size_t l_position,
234  std::string const& r_chromosome, size_t r_position
235 ) {
236  // We compare the chromosomes with strcmp, so that we only run the relatively expensive string
237  // comparison once, and then check the result - if equal, do the threeway position comparison.
238  auto const chr_cmp = std::strcmp( l_chromosome.c_str(), r_chromosome.c_str() );
239  if( chr_cmp != 0 ) {
240  return chr_cmp;
241  }
242  assert( l_chromosome == r_chromosome );
243  return utils::compare_threeway( l_position, r_position );
244 }
245 
249 inline int locus_compare(
250  std::string const& l_chromosome, size_t l_position,
251  std::string const& r_chromosome, size_t r_position,
252  ::genesis::sequence::SequenceDict const& sequence_dict
253 ) {
254  // Here, we want to compare chromosome names order based on the given dict.
255  // However, string lookup is a bit expensive, so we first do a quick check for equality,
256  // and only if they are not equal, we get their indices, and compare those.
257  if( l_chromosome == r_chromosome ) {
258  // For identical chromosomes, we compare the positions.
259  return utils::compare_threeway( l_position, r_position );
260  }
261 
262  // Here, we know the chromosomes are different, so we compare their indices.
263  // No need to compare the positions here again. We assert that they are indeed different.
264  auto const l_chr_idx = sequence_dict.index_of( l_chromosome );
265  auto const r_chr_idx = sequence_dict.index_of( r_chromosome );
266  auto const chr_cmp = utils::compare_threeway( l_chr_idx, r_chr_idx );
267  assert( chr_cmp != 0 );
268  return chr_cmp;
269 }
270 
271 // Add all other overloads for GenomeLocus and SequenceDict combinations.
273 
274 // Let's be cool and add the actual spaceship, even if genesis is currently using C++11.
275 // Users might compile with later versions, so this might be useful to have.
276 
277 #if __cplusplus >= 202002L
278 
282 inline int operator <=> ( GenomeLocus const& l, GenomeLocus const& r )
283 {
284  return locus_compare( l.chromosome, l.position, r.chromosome, r.position );
285 }
286 
287 #endif // __cplusplus >= 202002L
288 
289 // -------------------------------------------------------------------------
290 // Equality ==
291 // -------------------------------------------------------------------------
292 
300 inline bool locus_equal(
301  std::string const& l_chromosome, size_t l_position,
302  std::string const& r_chromosome, size_t r_position
303 ) {
304  return l_chromosome == r_chromosome && l_position == r_position;
305 }
306 
310 inline bool locus_equal(
311  GenomeLocus const& l,
312  std::string const& r_chromosome, size_t r_position
313 ) {
314  return locus_equal( l.chromosome, l.position, r_chromosome, r_position );
315 }
316 
320 inline bool locus_equal(
321  std::string const& l_chromosome, size_t l_position,
322  GenomeLocus const& r
323 ) {
324  return locus_equal( l_chromosome, l_position, r.chromosome, r.position );
325 }
326 
330 inline bool locus_equal(
331  GenomeLocus const& l,
332  GenomeLocus const& r
333 ) {
334  return locus_equal( l.chromosome, l.position, r.chromosome, r.position );
335 }
336 
340 inline bool operator == ( GenomeLocus const& l, GenomeLocus const& r )
341 {
342  return locus_equal( l.chromosome, l.position, r.chromosome, r.position );
343 }
344 
345 // -------------------------------------------------------------------------
346 // Inequality !=
347 // -------------------------------------------------------------------------
348 
356 inline bool locus_inequal(
357  std::string const& l_chromosome, size_t l_position,
358  std::string const& r_chromosome, size_t r_position
359 ) {
360  return ! locus_equal( l_chromosome, l_position, r_chromosome, r_position );
361 }
362 
366 inline bool locus_inequal(
367  GenomeLocus const& l,
368  std::string const& r_chromosome, size_t r_position
369 ) {
370  return locus_inequal( l.chromosome, l.position, r_chromosome, r_position );
371 }
372 
376 inline bool locus_inequal(
377  std::string const& l_chromosome, size_t l_position,
378  GenomeLocus const& r
379 ) {
380  return locus_inequal( l_chromosome, l_position, r.chromosome, r.position );
381 }
382 
386 inline bool locus_inequal(
387  GenomeLocus const& l,
388  GenomeLocus const& r
389 ) {
390  return locus_inequal( l.chromosome, l.position, r.chromosome, r.position );
391 }
392 
396 inline bool operator != ( GenomeLocus const& l, GenomeLocus const& r )
397 {
398  return locus_inequal( l.chromosome, l.position, r.chromosome, r.position );
399 }
400 
401 // -------------------------------------------------------------------------
402 // Less than <
403 // -------------------------------------------------------------------------
404 
414 inline bool locus_less(
415  std::string const& l_chromosome, size_t l_position,
416  std::string const& r_chromosome, size_t r_position
417 ) {
418  return l_chromosome < r_chromosome || ( l_chromosome == r_chromosome && l_position < r_position );
419 }
420 
424 inline bool locus_less(
425  std::string const& l_chromosome, size_t l_position,
426  std::string const& r_chromosome, size_t r_position,
427  ::genesis::sequence::SequenceDict const& sequence_dict
428 ) {
429  // Same logic as above, but using chromosome indices in the dict, instead of their names.
430  // We also apply the speedup of the locus_compare(), by first checking the strings for
431  // equality, before doing the expensive index lookup in the dict.
432  if( l_chromosome == r_chromosome ) {
433  // For identical chromosomes, we compare the positions.
434  return l_position < r_position;
435  }
436 
437  // Here, we know the chromosomes are different, so we compare their indices.
438  // No need to compare the positions here again. We assert that they are indeed different.
439  auto const l_chr_idx = sequence_dict.index_of( l_chromosome );
440  auto const r_chr_idx = sequence_dict.index_of( r_chromosome );
441  assert( l_chr_idx != r_chr_idx );
442  return l_chr_idx < r_chr_idx;
443 }
444 
445 // Add all other overloads for GenomeLocus and SequenceDict combinations.
447 
448 
451 inline bool operator < ( GenomeLocus const& l, GenomeLocus const& r )
452 {
453  return locus_less( l.chromosome, l.position, r.chromosome, r.position );
454 }
455 
456 // -------------------------------------------------------------------------
457 // Greater than >
458 // -------------------------------------------------------------------------
459 
465 inline bool locus_greater(
466  std::string const& l_chromosome, size_t l_position,
467  std::string const& r_chromosome, size_t r_position
468 ) {
469  // Just use the existing function, but with reverse l and r.
470  return locus_less( r_chromosome, r_position, l_chromosome, l_position );
471 }
472 
476 inline bool locus_greater(
477  std::string const& l_chromosome, size_t l_position,
478  std::string const& r_chromosome, size_t r_position,
479  ::genesis::sequence::SequenceDict const& sequence_dict
480 ) {
481  // Just use the existing function, but with reverse l and r.
482  return locus_less( r_chromosome, r_position, l_chromosome, l_position, sequence_dict );
483 }
484 
485 // Add all other overloads for GenomeLocus and SequenceDict combinations.
487 
488 
491 inline bool operator > ( GenomeLocus const& l, GenomeLocus const& r )
492 {
493  return locus_greater( l.chromosome, l.position, r.chromosome, r.position );
494 }
495 
496 // -------------------------------------------------------------------------
497 // Less than or equal <=
498 // -------------------------------------------------------------------------
499 
506  std::string const& l_chromosome, size_t l_position,
507  std::string const& r_chromosome, size_t r_position
508 ) {
509  // We could do the simple default way of implementing this as `a == b || a < b`,
510  // but this seems wasteful; in this case, we can do with fewer comparisons!
511  return l_chromosome < r_chromosome || ( l_chromosome == r_chromosome && l_position <= r_position );
512 }
513 
518  std::string const& l_chromosome, size_t l_position,
519  std::string const& r_chromosome, size_t r_position,
520  ::genesis::sequence::SequenceDict const& sequence_dict
521 ) {
522  // Same logic as in locus_less. See there for details.
523  if( l_chromosome == r_chromosome ) {
524  return l_position <= r_position;
525  }
526  auto const l_chr_idx = sequence_dict.index_of( l_chromosome );
527  auto const r_chr_idx = sequence_dict.index_of( r_chromosome );
528  assert( l_chr_idx != r_chr_idx );
529  return l_chr_idx < r_chr_idx;
530 }
531 
532 // Add all other overloads for GenomeLocus and SequenceDict combinations.
534 
535 
538 inline bool operator <= ( GenomeLocus const& l, GenomeLocus const& r )
539 {
540  return locus_less_or_equal( l.chromosome, l.position, r.chromosome, r.position );
541 }
542 
543 // -------------------------------------------------------------------------
544 // Greater than or equal >=
545 // -------------------------------------------------------------------------
546 
553  std::string const& l_chromosome, size_t l_position,
554  std::string const& r_chromosome, size_t r_position
555 ) {
556  // Just use the existing function, but with reverse l and r.
557  return locus_less_or_equal( r_chromosome, r_position, l_chromosome, l_position );
558 }
559 
564  std::string const& l_chromosome, size_t l_position,
565  std::string const& r_chromosome, size_t r_position,
566  ::genesis::sequence::SequenceDict const& sequence_dict
567 ) {
568  // Just use the existing function, but with reverse l and r.
569  return locus_less_or_equal( r_chromosome, r_position, l_chromosome, l_position, sequence_dict );
570 }
571 
572 // Add all other overloads for GenomeLocus and SequenceDict combinations.
574 
575 
578 inline bool operator >= ( GenomeLocus const& l, GenomeLocus const& r )
579 {
580  return locus_greater_or_equal( l.chromosome, l.position, r.chromosome, r.position );
581 }
582 
583 // -------------------------------------------------------------------------
584 // Clean Up
585 // -------------------------------------------------------------------------
586 
587 // We do not need to propagate this macro throughout our code base.
588 #undef ADD_LOCUS_COMPARISON_OVERLOADS
589 
590 } // namespace population
591 } // namespace genesis
592 
593 #endif // include guard
genesis::population::locus_equal
bool locus_equal(std::string const &l_chromosome, size_t l_position, std::string const &r_chromosome, size_t r_position)
Equality comparison (==) for two loci in a genome.
Definition: function/genome_locus.hpp:300
genesis::population::operator!=
bool operator!=(GenomeLocus const &l, GenomeLocus const &r)
Inequality comparison (!=) for two loci in a genome.
Definition: function/genome_locus.hpp:396
genesis::utils::compare_threeway
constexpr int compare_threeway(T lhs, U rhs)
Three-way comparison (spaceship operator) for C++ <20.
Definition: common.hpp:145
genesis::population::operator<<
std::ostream & operator<<(std::ostream &os, SampleCounts const &bs)
Output stream operator for SampleCounts instances.
Definition: population/function/functions.cpp:649
genesis::sequence::SequenceDict
Store dictionary/index data on sequence files, such as coming from .fai or .dict files.
Definition: sequence_dict.hpp:63
genesis::population::GenomeLocus::position
size_t position
Definition: genome_locus.hpp:67
common.hpp
genome_locus.hpp
genesis::population::locus_greater
bool locus_greater(std::string const &l_chromosome, size_t l_position, std::string const &r_chromosome, size_t r_position)
Greater than comparison (>) for two loci in a genome.
Definition: function/genome_locus.hpp:465
genesis::population::locus_compare
int locus_compare(std::string const &l_chromosome, size_t l_position, std::string const &r_chromosome, size_t r_position)
Three-way comparison (spaceship operator <=>) for two loci in a genome.
Definition: function/genome_locus.hpp:232
genesis::population::operator==
bool operator==(GenomeLocus const &l, GenomeLocus const &r)
Equality comparison (==) for two loci in a genome.
Definition: function/genome_locus.hpp:340
genesis::population::GenomeLocus
A single locus, that is, a position (or coordinate) on a chromosome.
Definition: genome_locus.hpp:64
genesis::population::locus_greater_or_equal
bool locus_greater_or_equal(std::string const &l_chromosome, size_t l_position, std::string const &r_chromosome, size_t r_position)
Greater than or equal comparison (>=) for two loci in a genome.
Definition: function/genome_locus.hpp:552
genesis::population::GenomeLocus::chromosome
std::string chromosome
Definition: genome_locus.hpp:66
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
ADD_LOCUS_COMPARISON_OVERLOADS
#define ADD_LOCUS_COMPARISON_OVERLOADS(CMP_FUNC)
Definition: function/genome_locus.hpp:90
sequence_dict.hpp
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::locus_less
bool locus_less(std::string const &l_chromosome, size_t l_position, std::string const &r_chromosome, size_t r_position)
Less than comparison (<) for two loci in a genome.
Definition: function/genome_locus.hpp:414
genesis::sequence::SequenceDict::index_of
size_t index_of(std::string const &name) const
Definition: sequence_dict.hpp:240
genesis::population::locus_inequal
bool locus_inequal(std::string const &l_chromosome, size_t l_position, std::string const &r_chromosome, size_t r_position)
Inequality comparison (!=) for two loci in a genome.
Definition: function/genome_locus.hpp:356
genesis::population::locus_less_or_equal
bool locus_less_or_equal(std::string const &l_chromosome, size_t l_position, std::string const &r_chromosome, size_t r_position)
Less than or equal comparison (<=) for two loci in a genome.
Definition: function/genome_locus.hpp:505