A library for working with phylogenetic and population genetic data.
v0.32.0
fastq_writer.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@sund.ku.dk>
20  University of Copenhagen, Globe Institute, Section for GeoGenetics
21  Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
22 */
23 
32 
38 
39 #include <cassert>
40 #include <fstream>
41 #include <sstream>
42 #include <stdexcept>
43 
44 namespace genesis {
45 namespace sequence {
46 
47 // =================================================================================================
48 // Internal helper
49 // =================================================================================================
50 
56 template<class StringType>
58  StringType const& label,
59  StringType const& sites,
60  StringType const& quality,
61  size_t line_length,
62  bool repeat_label,
63  std::ostream& os
64 ) {
65  // This function is only called internally, with the correct sizes.
66  assert( sites.size() == quality.size() );
67 
68  // Helper function to write lines with potential wrapping.
69  auto write_wrapped_ = [&]( StringType const& str ){
70  if( line_length > 0 ) {
71  for( size_t i = 0; i < str.size(); i += line_length ) {
72  // Write line_length many characters.
73  // (If the string is shorter, as many characters as possible are used.)
74  // os << str.substr( i, line_length ) << "\n";
75  auto const sub = str.substr( i, line_length );
76  os.write( sub.data(), sub.size() );
77  os.write( "\n", 1 );
78  }
79  } else {
80  // os << str << "\n";
81  os.write( str.data(), str.size() );
82  os.write( "\n", 1 );
83  }
84  };
85 
86  // Write label.
87  // os << "@" << label << "\n";
88  os.write( "@", 1 );
89  os.write( label.data(), label.size() );
90  os.write( "\n", 1 );
91 
92  // Write sequence.
93  write_wrapped_( sites );
94 
95  // Write third line, repeat label if necessary.
96  if( repeat_label ) {
97  // os << "+" << label << "\n";
98  os.write( "+", 1 );
99  os.write( label.data(), label.size() );
100  os.write( "\n", 1 );
101  } else {
102  // os << "+\n";
103  os.write( "+\n", 2 );
104  }
105 
106  // Write the phred quality score.
107  write_wrapped_( quality );
108 }
109 
110 // =================================================================================================
111 // Writing
112 // =================================================================================================
113 
115  Sequence const& sequence,
116  std::shared_ptr<utils::BaseOutputTarget> target
117 ) const {
118  // Produce the phred quality score.
119  std::string quality_string;
120  if( sequence.phred_scores().size() == sequence.sites().size() ) {
121 
122  // Default case: proper phred quality scores. We do a lot of string copies here (first,
123  // to get the scores in string form, then possibly for wrapping the lines), which is slow.
124  // For now, we do not need to write Fastq that often, so we can live with that.
125  // Can be optimized if needed. Same for the "const dummy scores" case below.
126  quality_string = quality_encode_from_phred_score( sequence.phred_scores() );
127 
128  } else if( sequence.phred_scores().size() == 0 ) {
129 
130  // Make a string filled with the filler quality char.
131  quality_string = make_filled_quality_string_( sequence.sites().size() );
132 
133  } else {
134  // Error case.
135  throw std::runtime_error(
136  "Invalid Sequence with phred scores of different length than the sequence has sites."
137  );
138  }
139 
140  // Now write all of this to the stream
141  write_sequence_( sequence.label(), sequence.sites(), quality_string, target->ostream() );
142 }
143 
145  Sequence const& sequence,
146  std::string const& quality_string,
147  std::shared_ptr<utils::BaseOutputTarget> target
148 ) const {
149  // We want to avoid mistakes here of calling this function with a provided qualith string,
150  // in situations where the sequence itself already contains one.
151  if( ! sequence.phred_scores().empty() ) {
152  throw std::runtime_error(
153  "Cannot write Fastq sequence with provided quality string "
154  "if the sequence contains phred scores already."
155  );
156  }
157 
158  // Check that the quality string has the right lenght, or fill in otherwise.
159  if( quality_string.size() == sequence.sites().size() ) {
160  write_sequence_( sequence.label(), sequence.sites(), quality_string, target->ostream() );
161  } else if( quality_string.size() == 0 ) {
162  auto const filled_string = make_filled_quality_string_( sequence.sites().size() );
163  write_sequence_( sequence.label(), sequence.sites(), filled_string, target->ostream() );
164  } else {
165  throw std::runtime_error(
166  "Invalid given quality string of different length than the sequence has sites."
167  );
168  }
169 }
170 
172  SequenceSet const& sequence_set,
173  std::shared_ptr<utils::BaseOutputTarget> target
174 ) const {
175  for( Sequence const& sequence : sequence_set ) {
176  write( sequence, target );
177  }
178 }
179 
180 #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
181 
182 void FastqWriter::write(
183  std::string_view const& label,
184  std::string_view const& sites,
185  std::string_view const& quality,
186  std::shared_ptr<utils::BaseOutputTarget> target
187 ) const {
188  // We need to make sure that a quality string is given, or filled in.
189  // We use an internal buffer to store the filled quality if needed, and redirect to there.
190  std::string quality_buffer;
191  std::string_view quality_view = quality;
192  if( quality.empty() ) {
193  quality_buffer = make_filled_quality_string_( sites.size() );
194  quality_view = std::string_view( quality_buffer );
195  } else if( quality.size() != sites.size() ) {
196  throw std::runtime_error(
197  "Invalid Sequence with quality string of different length than the sequence has sites."
198  );
199  }
200 
201  // Now write the data, using the view into either the original quality, or the buffer.
202  fastq_writer_write_sequence_helper_<std::string_view>(
203  label, sites, quality_view, line_length_, repeat_label_, target->ostream()
204  );
205 }
206 
207 #endif
208 
209 // =================================================================================================
210 // Internal Members
211 // =================================================================================================
212 
213 std::string FastqWriter::make_filled_quality_string_(
214  size_t length
215 ) const {
216  // Special case: Sequence does not have phred quality scores.
217  // Either throw, or use const dummy scores.
218  if( fill_missing_quality_ == 255 ) {
219  throw std::runtime_error(
220  "Sequence without phred scores found. "
221  "Use FastqWriter::fill_missing_quality() to use dummy score values instead."
222  );
223  }
224 
225  // Slightly slow: Make a string, then put it to the stream.
226  // Too lazy for a faster implementation, as this will rarely be needed anyway.
227  auto const dummy_chr = quality_encode_from_phred_score( fill_missing_quality_ );
228  auto quality_string = std::string( length, dummy_chr );
229  assert( quality_string.size() == length );
230  return quality_string;
231 }
232 
233 void FastqWriter::write_sequence_(
234  std::string const& label,
235  std::string const& sites,
236  std::string const& quality,
237  std::ostream& os
238 ) const {
239  fastq_writer_write_sequence_helper_<std::string>(
240  label, sites, quality, line_length_, repeat_label_, os
241  );
242 }
243 
244 } // namespace sequence
245 } // namespace genesis
fs.hpp
Provides functions for accessing the file system.
genesis::sequence::Sequence
Definition: sequence/sequence.hpp:40
output_stream.hpp
genesis::tree::length
double length(Tree const &tree)
Get the length of the tree, i.e., the sum of all branch lengths.
Definition: tree/common_tree/functions.cpp:160
sequence_set.hpp
fastq_writer.hpp
genesis::sequence::Sequence::label
std::string & label()
Definition: sequence/sequence.hpp:90
genesis::sequence::FastqWriter::write
void write(Sequence const &sequence, std::shared_ptr< utils::BaseOutputTarget > target) const
Write a single Sequence to an output target, using the Fastq format.
Definition: fastq_writer.cpp:114
genesis::sequence::fastq_writer_write_sequence_helper_
static void fastq_writer_write_sequence_helper_(StringType const &label, StringType const &sites, StringType const &quality, size_t line_length, bool repeat_label, std::ostream &os)
Local function template that does the actual work.
Definition: fastq_writer.cpp:57
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::sequence::SequenceSet
Store a set of Sequences.
Definition: sequence_set.hpp:53
genesis::sequence::Sequence::phred_scores
std::vector< unsigned char > & phred_scores()
Definition: sequence/sequence.hpp:130
genesis::sequence::Sequence::sites
std::string & sites()
Definition: sequence/sequence.hpp:110
genesis::sequence::quality_encode_from_phred_score
char quality_encode_from_phred_score(unsigned char phred_score, bool clamp=true)
Encode a phred score into a quality char, using the Sanger convention.
Definition: quality.hpp:141
sequence.hpp
quality.hpp