1 #ifndef GENESIS_SEQUENCE_REFERENCE_GENOME_H_
2 #define GENESIS_SEQUENCE_REFERENCE_GENOME_H_
46 #include <unordered_map>
73 using iterator =
typename std::vector<Sequence>::iterator;
85 guard_ = genesis::utils::make_unique<std::mutex>();
105 return sequences_.size();
113 return sequences_.empty();
118 return lookup_.count( label );
129 std::lock_guard<std::mutex> lock( *guard_ );
132 assert( cache_ == std::numeric_limits<size_t>::max() || cache_ < sequences_.size() );
133 if( cache_ != std::numeric_limits<size_t>::max() && sequences_[cache_].label() == label ) {
134 assert( cache_ < sequences_.size() );
135 return sequences_.begin() + cache_;
139 auto lit = lookup_.find( label );
140 if( lit == lookup_.end() ) {
141 return sequences_.end();
143 assert( lit->first == label );
144 assert( lit->second < sequences_.size() );
145 cache_ = lit->second;
146 return sequences_.begin() + lit->second;
154 auto const it =
find( label );
155 if( it == sequences_.end() ) {
156 throw std::runtime_error(
157 "Reference Genome does not contain requested sequence \"" + label +
"\""
174 inline char get_base( std::string
const& chromosome,
size_t position,
bool to_upper =
true )
const
188 assert( it != sequences_.end() );
191 auto const& ref_seq = *it;
192 if( position == 0 || position - 1 >= ref_seq.length() ) {
193 throw std::runtime_error(
194 "Reference Genome sequence \"" + ref_seq.label() +
"\" is " +
196 " bases long, which is shorter than then requested (1-based) position " +
197 std::to_string( position ) +
". This likely means that some input data contains " +
198 "positions beyond the length of the reference genome, which is invalid."
201 assert( position - 1 < ref_seq.length() );
205 return ref_seq[ position - 1 ];
224 return add(
Sequence{seq}, also_look_up_first_word );
235 auto const label1 = seq.label();
236 if( lookup_.count( label1 ) > 0 ) {
237 throw std::runtime_error(
238 "Reference Genome already contains sequence name \"" + label1 +
"\", "
239 "which cannot be added again."
242 assert( lookup_.count( label1 ) == 0 );
247 auto const label2 =
utils::split( seq.label(),
"\t " )[0];
248 if( also_look_up_first_word && lookup_.count( label2 ) > 0 ) {
249 throw std::runtime_error(
250 "Reference Genome already contains sequence name \"" + label2 +
"\", "
251 "which is the shortened version of the original name \"" + label1 +
"\"."
259 std::lock_guard<std::mutex> lock( *guard_ );
262 sequences_.push_back( std::move(seq) );
263 cache_ = std::numeric_limits<size_t>::max();
269 assert( sequences_.size() > 0 );
270 lookup_[ label1 ] = sequences_.size() - 1;
271 assert( lookup_.count( label1 ) > 0 );
272 if( also_look_up_first_word ) {
273 lookup_[ label2 ] = sequences_.size() - 1;
274 assert( lookup_.count( label2 ) > 0 );
278 return sequences_.back();
287 std::lock_guard<std::mutex> lock( *guard_ );
290 cache_ = std::numeric_limits<size_t>::max();
299 return sequences_.cbegin();
304 return sequences_.cend();
309 return sequences_.cbegin();
314 return sequences_.cend();
324 std::vector<Sequence> sequences_;
325 std::unordered_map<std::string, size_t> lookup_;
330 mutable size_t cache_;
331 mutable std::unique_ptr<std::mutex> guard_;
338 #endif // include guard