A toolkit for working with phylogenetic data.
v0.24.0
fastq_writer.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
38 
39 #include <cassert>
40 #include <fstream>
41 #include <sstream>
42 #include <stdexcept>
43 
44 namespace genesis {
45 namespace sequence {
46 
47 // =================================================================================================
48 // Writing
49 // =================================================================================================
50 
51 void FastqWriter::write( Sequence const& sequence, std::shared_ptr<utils::BaseOutputTarget> target ) const
52 {
53  write_sequence( sequence, target->ostream() );
54 }
55 
56 void FastqWriter::write( SequenceSet const& sequence_set, std::shared_ptr<utils::BaseOutputTarget> target ) const
57 {
58  auto& os = target->ostream();
59  for( Sequence const& sequence : sequence_set ) {
60  write_sequence( sequence, os );
61  }
62 }
63 
64 void FastqWriter::write_sequence( Sequence const& seq, std::ostream& os ) const
65 {
66  // Write label.
67  os << "@" << seq.label() << "\n";
68 
69  // Write sequence. If needed, add new line at every line_length_ position.
70  if( line_length_ > 0 ) {
71  for( size_t i = 0; i < seq.length(); i += line_length_ ) {
72  // Write line_length_ many characters.
73  // (If the string is shorter, as many characters as possible are used.)
74  os << seq.sites().substr( i, line_length_ ) << "\n";
75  }
76  } else {
77  os << seq.sites() << "\n";
78  }
79 
80  // Write third line, repeat label if necessary.
81  if( repeat_label_ ) {
82  os << "+" << seq.label() << "\n";
83  } else {
84  os << "+\n";
85  }
86 
87  // Write phred quality score.
88  if( seq.phred_scores().size() == seq.sites().size() ) {
89 
90  // Default case: proper phred quality scores. We do a lot of string copies here (first,
91  // to get the scores in string form, then possibly for wrapping the lines), which is slow.
92  // For now, we do not need to write Fastq that often, so we can live with that.
93  // Can be optimized if needed. Same for the "const dummy scores" case below.
94  auto const scores = quality_encode_from_phred_score( seq.phred_scores() );
95  if( line_length_ > 0 ) {
96  for( size_t i = 0; i < scores.size(); i += line_length_ ) {
97  // Write line_length_ many characters.
98  // (If the string is shorter, as many characters as possible are used.)
99  os << scores.substr( i, line_length_ ) << "\n";
100  }
101  } else {
102  os << scores << "\n";
103  }
104 
105  } else if( seq.phred_scores().size() == 0 ) {
106 
107  // Special case: Sequence does not have phred quality scores.
108  // Either throw, or use const dummy scores.
109  if( fill_missing_quality_ == 255 ) {
110  throw std::runtime_error(
111  "Sequence without phred scores found. "
112  "Use FastqWriter::fill_missing_quality() to use dummy score values instead."
113  );
114  } else {
115 
116  // Slightly slow: Make a string, then put it to the stream.
117  // Too lazy for a faster implementation, as this will rarely be needed anyway.
118  auto const dummy_chr = quality_encode_from_phred_score( fill_missing_quality_ );
119  auto const dummy_str = std::string( seq.sites().size(), dummy_chr );
120  assert( dummy_str.size() == seq.sites().size() );
121 
122  if( line_length_ > 0 ) {
123  for( size_t i = 0; i < dummy_str.size(); i += line_length_ ) {
124  // Write line_length_ many characters.
125  // (If the string is shorter, as many characters as possible are used.)
126  os << dummy_str.substr( i, line_length_ ) << "\n";
127  }
128  } else {
129  os << dummy_str << "\n";
130  }
131  }
132 
133  } else {
134  // Error case.
135  throw std::runtime_error(
136  "Invalid Sequence with phred scores of different length than the sequence has sites."
137  );
138  }
139 }
140 
141 // =================================================================================================
142 // Properties
143 // =================================================================================================
144 
146 {
147  line_length_ = value;
148  return *this;
149 }
150 
152 {
153  return line_length_;
154 }
155 
157 {
158  fill_missing_quality_ = value;
159  return *this;
160 }
161 
163 {
164  return fill_missing_quality_;
165 }
166 
168 {
169  repeat_label_ = value;
170  return *this;
171 }
172 
174 {
175  return repeat_label_;
176 }
177 
178 } // namespace sequence
179 } // namespace genesis
void write_sequence(Sequence const &sequence, std::ostream &os) const
Write a single Sequence to an output stream in Fastq format.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
void write(Sequence const &sequence, std::shared_ptr< utils::BaseOutputTarget > target) const
Write a single Sequence to an output target, using the Fastq format.
std::vector< unsigned char > & phred_scores()
Provides functions for accessing the file system.
unsigned char fill_missing_quality() const
Get the current value to fill missing phred quality scores.
Store a set of Sequences.
size_t length() const
Return the length (number of sites) of this sequence.
char quality_encode_from_phred_score(unsigned char phred_score, bool clamp=true)
Encode a phred score into a quality char, using the Sanger convention.
Definition: quality.hpp:128
bool repeat_label() const
Get whether the setting to repeat the sequence identifier (label) on the third line is set...
size_t line_length() const
Get the current line length.