A library for working with phylogenetic and population genetic data.
v0.27.0
sam_flags.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2022 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
31 #ifdef GENESIS_HTSLIB
32 
34 
37 
38 #include <cassert>
39 #include <cstdint>
40 #include <cstring>
41 #include <stdexcept>
42 #include <unordered_map>
43 
44 extern "C" {
45  #include <htslib/sam.h>
46 }
47 
48 namespace genesis {
49 namespace population {
50 
51 // =================================================================================================
52 // Name to Value Map
53 // =================================================================================================
54 
/**
 * @brief Lookup table from textual flag names to the numerical values of the sam flags.
 *
 * Some flags can be addressed by more than one spelling. All keys are lower case and
 * consist of letters and digits only, so that they can be matched against input that
 * has been converted to lower case and stripped of all other characters.
 */
static const std::unordered_map<std::string, int> sam_flag_name_to_int_ = {
    // Pairing information (0x1, 0x2)
    { "paired",        1 << 0 },
    { "properpair",    1 << 1 },

    // Mapping status of the read itself and of its mate (0x4, 0x8)
    { "unmap",         1 << 2 },
    { "unmapped",      1 << 2 },
    { "munmap",        1 << 3 },
    { "mateunmapped",  1 << 3 },

    // Strand of the read itself and of its mate (0x10, 0x20)
    { "reverse",       1 << 4 },
    { "mreverse",      1 << 5 },
    { "matereverse",   1 << 5 },

    // Position of the read within its template (0x40, 0x80)
    { "read1",         1 << 6 },
    { "read2",         1 << 7 },

    // Remaining flags (0x100, 0x200, 0x400, 0x800)
    { "secondary",     1 << 8 },
    { "qcfail",        1 << 9 },
    { "dup",           1 << 10 },
    { "duplicate",     1 << 10 },
    { "supplementary", 1 << 11 }
};
76 
77 // =================================================================================================
78 // Flag Helper Functions
79 // =================================================================================================
80 
81 int string_to_sam_flag( std::string const& value )
82 {
83  // Use the htslib internal parsing function, which takes care of numbers, and
84  // their spelling of the flag names (just upper case, underscores between words).
85  // See also parse_sam_flag?! not sure what it does or if its needed.
86  int result = bam_str2flag( value.c_str() );
87  if( result >= 0 ) {
88  return result;
89  }
90 
91  // We could check that the number is actually within the range of what can be specified
92  // with the sam flags. But htslib does not do that either, and we can just ignore any
93  // values that are larger than the flags.
94  // if( result >= 0 && result < 0x1000 - 1 ) {
95  // return result;
96  // }
97  // if( result >= 0x1000 - 1 ) {
98  // throw std::invalid_argument(
99  // "Invalid sam flag value \"" + value + "\" that is higher than the maximum flag value"
100  // );
101  // }
102 
103  // If the above fails, add our own more lenient parsing on top.
104  // We allow comma, space, and plus as delimiters, because why not.
105  auto const flags = utils::split( value, ",+| " );
106 
107  // Clean up all flags by removing non-alpha-numeric chars, and making it lower case.
108  // Then check in the map to get the value.
109  result = 0;
110  for( auto const& flag : flags ) {
111  auto cleaned_value = utils::to_lower_ascii( flag );
112  utils::erase_if( cleaned_value, []( char const c ){
113  return ! utils::is_alnum( c );
114  });
115 
116  // Try the sam function first, which takes care of all numeric values.
117  int const tmp = bam_str2flag( cleaned_value.c_str() );
118  if( tmp >= 0 ) {
119  result |= tmp;
120  continue;
121  }
122 
123  // If that did not work, we expect a name, and use our map to look it up.
124  if( sam_flag_name_to_int_.count( cleaned_value ) == 0 ) {
125  throw std::invalid_argument( "Invalid sam flag name \"" + flag + "\"" );
126  }
127  result |= sam_flag_name_to_int_.at( cleaned_value );
128  }
129  return result;
130 }
131 
132 std::string sam_flag_to_string( int flags )
133 {
134  char* str = bam_flag2str( flags );
135  auto res = std::string( str );
136  free( str );
137  return res;
138 }
139 
140 // =================================================================================================
141 // Validity check
142 // =================================================================================================
143 
144 static_assert(
145  static_cast<int>( 0x1 ) == BAM_FPAIRED,
146  "Definitions of BAM_FPAIRED in htslib and in genesis differ. "
147  "Please submit a bug report at https://github.com/lczech/genesis/issues"
148 );
149 static_assert(
150  static_cast<int>( 0x2 ) == BAM_FPROPER_PAIR,
151  "Definitions of BAM_FPROPER_PAIR in htslib and in genesis differ. "
152  "Please submit a bug report at https://github.com/lczech/genesis/issues"
153 );
154 static_assert(
155  static_cast<int>( 0x4 ) == BAM_FUNMAP,
156  "Definitions of BAM_FUNMAP in htslib and in genesis differ. "
157  "Please submit a bug report at https://github.com/lczech/genesis/issues"
158 );
159 static_assert(
160  static_cast<int>( 0x8 ) == BAM_FMUNMAP,
161  "Definitions of BAM_FMUNMAP in htslib and in genesis differ. "
162  "Please submit a bug report at https://github.com/lczech/genesis/issues"
163 );
164 static_assert(
165  static_cast<int>( 0x10 ) == BAM_FREVERSE,
166  "Definitions of BAM_FREVERSE in htslib and in genesis differ. "
167  "Please submit a bug report at https://github.com/lczech/genesis/issues"
168 );
169 static_assert(
170  static_cast<int>( 0x20 ) == BAM_FMREVERSE,
171  "Definitions of BAM_FMREVERSE in htslib and in genesis differ. "
172  "Please submit a bug report at https://github.com/lczech/genesis/issues"
173 );
174 static_assert(
175  static_cast<int>( 0x40 ) == BAM_FREAD1,
176  "Definitions of BAM_FREAD1 in htslib and in genesis differ. "
177  "Please submit a bug report at https://github.com/lczech/genesis/issues"
178 );
179 static_assert(
180  static_cast<int>( 0x80 ) == BAM_FREAD2,
181  "Definitions of BAM_FREAD2 in htslib and in genesis differ. "
182  "Please submit a bug report at https://github.com/lczech/genesis/issues"
183 );
184 static_assert(
185  static_cast<int>( 0x100 ) == BAM_FSECONDARY,
186  "Definitions of BAM_FSECONDARY in htslib and in genesis differ. "
187  "Please submit a bug report at https://github.com/lczech/genesis/issues"
188 );
189 static_assert(
190  static_cast<int>( 0x200 ) == BAM_FQCFAIL,
191  "Definitions of BAM_FQCFAIL in htslib and in genesis differ. "
192  "Please submit a bug report at https://github.com/lczech/genesis/issues"
193 );
194 static_assert(
195  static_cast<int>( 0x400 ) == BAM_FDUP,
196  "Definitions of BAM_FDUP in htslib and in genesis differ. "
197  "Please submit a bug report at https://github.com/lczech/genesis/issues"
198 );
199 static_assert(
200  static_cast<int>( 0x800 ) == BAM_FSUPPLEMENTARY,
201  "Definitions of BAM_FSUPPLEMENTARY in htslib and in genesis differ. "
202  "Please submit a bug report at https://github.com/lczech/genesis/issues"
203 );
204 
205 // static_assert(
206 // static_cast<int>( SamFlag::kPaired ) == BAM_FPAIRED,
207 // "Definitions of BAM_FPAIRED in htslib and of SamFlag::kPaired in genesis differ. "
208 // "Please submit a bug report at https://github.com/lczech/genesis/issues"
209 // );
210 // static_assert(
211 // static_cast<int>( SamFlag::kProperPair ) == BAM_FPROPER_PAIR,
212 // "Definitions of BAM_FPROPER_PAIR in htslib and of SamFlag::kProperPair in genesis differ. "
213 // "Please submit a bug report at https://github.com/lczech/genesis/issues"
214 // );
215 // static_assert(
216 // static_cast<int>( SamFlag::kUnmapped ) == BAM_FUNMAP,
217 // "Definitions of BAM_FUNMAP in htslib and of SamFlag::kUnmapped in genesis differ. "
218 // "Please submit a bug report at https://github.com/lczech/genesis/issues"
219 // );
220 // static_assert(
221 // static_cast<int>( SamFlag::kMateUnmapped ) == BAM_FMUNMAP,
222 // "Definitions of BAM_FMUNMAP in htslib and of SamFlag::kMateUnmapped in genesis differ. "
223 // "Please submit a bug report at https://github.com/lczech/genesis/issues"
224 // );
225 // static_assert(
226 // static_cast<int>( SamFlag::kReverse ) == BAM_FREVERSE,
227 // "Definitions of BAM_FREVERSE in htslib and of SamFlag::kReverse in genesis differ. "
228 // "Please submit a bug report at https://github.com/lczech/genesis/issues"
229 // );
230 // static_assert(
231 // static_cast<int>( SamFlag::kMateReverse ) == BAM_FMREVERSE,
232 // "Definitions of BAM_FMREVERSE in htslib and of SamFlag::kMateReverse in genesis differ. "
233 // "Please submit a bug report at https://github.com/lczech/genesis/issues"
234 // );
235 // static_assert(
236 // static_cast<int>( SamFlag::kRead1 ) == BAM_FREAD1,
237 // "Definitions of BAM_FREAD1 in htslib and of SamFlag::kRead1 in genesis differ. "
238 // "Please submit a bug report at https://github.com/lczech/genesis/issues"
239 // );
240 // static_assert(
241 // static_cast<int>( SamFlag::kRead2 ) == BAM_FREAD2,
242 // "Definitions of BAM_FREAD2 in htslib and of SamFlag::kRead2 in genesis differ. "
243 // "Please submit a bug report at https://github.com/lczech/genesis/issues"
244 // );
245 // static_assert(
246 // static_cast<int>( SamFlag::kSecondary ) == BAM_FSECONDARY,
247 // "Definitions of BAM_FSECONDARY in htslib and of SamFlag::kSecondary in genesis differ. "
248 // "Please submit a bug report at https://github.com/lczech/genesis/issues"
249 // );
250 // static_assert(
251 // static_cast<int>( SamFlag::kQcFail ) == BAM_FQCFAIL,
252 // "Definitions of BAM_FQCFAIL in htslib and of SamFlag::kQcFail in genesis differ. "
253 // "Please submit a bug report at https://github.com/lczech/genesis/issues"
254 // );
255 // static_assert(
256 // static_cast<int>( SamFlag::kDuplicate ) == BAM_FDUP,
257 // "Definitions of BAM_FDUP in htslib and of SamFlag::kDuplicate in genesis differ. "
258 // "Please submit a bug report at https://github.com/lczech/genesis/issues"
259 // );
260 // static_assert(
261 // static_cast<int>( SamFlag::kSupplementary ) == BAM_FSUPPLEMENTARY,
262 // "Definitions of BAM_FSUPPLEMENTARY in htslib and of SamFlag::kSupplementary in genesis differ. "
263 // "Please submit a bug report at https://github.com/lczech/genesis/issues"
264 // );
265 
266 } // namespace population
267 } // namespace genesis
268 
269 #endif // htslib guard
algorithm.hpp
Provides some valuable algorithms that are not part of the C++ 11 STL.
sam_flags.hpp
genesis::utils::is_alnum
constexpr bool is_alnum(char c) noexcept
Return whether a char is a letter (a-z or A-Z) or a digit (0-9), ASCII-only.
Definition: char.hpp:143
string.hpp
Provides some commonly used string utility functions.
genesis::utils::erase_if
void erase_if(Container &c, UnaryPredicate p)
Erases all elements from the container that satisfy a given predicate. An element is erased,...
Definition: algorithm.hpp:107
genesis::population::sam_flag_name_to_int_
static const std::unordered_map< std::string, int > sam_flag_name_to_int_
Map from sam flags to their numerical value, for different types of naming of the flags.
Definition: sam_flags.cpp:58
genesis::population::string_to_sam_flag
int string_to_sam_flag(std::string const &value)
Parse a string as a set of flags for sam/bam/cram reads.
Definition: sam_flags.cpp:81
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::population::sam_flag_to_string
std::string sam_flag_to_string(int flags)
Turn a set of flags for sam/bam/cram reads into their textual representation.
Definition: sam_flags.cpp:132
genesis::utils::to_lower_ascii
std::string to_lower_ascii(std::string const &str)
Return an all-lowercase copy of the given string, ASCII-only.
Definition: string.cpp:668
genesis::utils::split
std::vector< std::string > split(std::string const &str, std::string const &delimiters, const bool trim_empty)
Split a string into parts, given a set of delimiter chars.
Definition: string.cpp:386