A library for working with phylogenetic and population genetic data.
v0.32.0
gzip_input_source.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2023 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lczech@carnegiescience.edu>
20  Department of Plant Biology, Carnegie Institution For Science
21  260 Panama Street, Stanford, CA 94305, USA
22 */
23 
32 
35 
36 #include <cassert>
37 #include <cstdio>
38 #include <cstring>
39 #include <stdexcept>
40 
41 #ifdef GENESIS_ZLIB
42 
43 # include "zlib.h"
44 
45 # if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
46 # include <fcntl.h>
47 # include <io.h>
48 # endif
49 
50 #endif // GENESIS_ZLIB
51 
52 namespace genesis {
53 namespace utils {
54 
55 // =================================================================================================
56 // Zlib Data
57 // =================================================================================================
58 
59 #ifdef GENESIS_ZLIB
60 
64 struct GzipInputSource::ZlibData
65 {
66  // Zlib object. This can be destroyed and created multiple times, if the input data
67  // consists of multiple concatenated gzip streams. Hence, we need a flag to note
68  // whether the zlib stream is initialized or not.
69  // NB: `z_stream` is the name of the struct, as provided by zlib.h, while `zstream` is our
70  // variable name for the instance of that struct.
71  z_stream zstream;
72  bool initialized = false;
73 
74  // Input buffer, our current position in the buffer, and the past-the-end position
75  // (can be shorter than the buffer length, if there is not enough input).
76  // These are persistent, even if we destroyed the zstream above due to having reached
77  // the end of an input gzip part (for multiple concatenated gzip streams).
78  char in_buf[ BlockLength ];
79  size_t in_pos = 0;
80  size_t in_end = 0;
81 };
82 
83 #else // GENESIS_ZLIB
84 
88 struct GzipInputSource::ZlibData
89 {
90  // Empty on purpose.
91 };
92 
93 #endif // GENESIS_ZLIB
94 
95 // =================================================================================================
96 // Gzip Input Source: Functions with zlib
97 // =================================================================================================
98 
99 #ifdef GENESIS_ZLIB
100 
102  std::shared_ptr<BaseInputSource> input_source,
104 )
105  : input_source_( input_source )
106  , format_( format )
107  , format_name_( translate_format_( format ))
108  , zlib_data_(
109  // Create a new instance, and impute a destructor as well.
110  new ZlibData(),
111  []( ZlibData *impl ) { delete impl; }
112  )
113 {
114  create_zstream_();
115 }
116 
118 {
119  destroy_zstream_();
120 }
121 
122 void GzipInputSource::create_zstream_()
123 {
124  assert( zlib_data_ );
125  assert( zlib_data_->initialized == false );
126  auto& zstream = zlib_data_->zstream;
127 
128  // Init zlib inflate state
129  zstream = z_stream();
130  zstream.zalloc = Z_NULL;
131  zstream.zfree = Z_NULL;
132  zstream.opaque = Z_NULL;
133  zstream.avail_in = 0;
134  zstream.next_in = Z_NULL;
135 
136  // Init zlib
137  auto ret = inflateInit2( &zstream, get_format_( format_ ));
138  if( ret != Z_OK ) {
139  throw GzipError( zstream.msg, ret );
140  }
141 
142  zlib_data_->initialized = true;
143 }
144 
145 void GzipInputSource::destroy_zstream_()
146 {
147  // Call the zlib end function. In case this destroy function is called from the destructor,
148  // this is called before the inner zlib_data_ class is destroyed, so this is in correct order.
149  // Usually, we however already reached the end of the input stream, and hence already
150  // have ended the zlib object before. In that case, nothing to do here.
151 
152  assert( zlib_data_ );
153  if( zlib_data_->initialized ) {
154 
155  inflateEnd( &zlib_data_->zstream );
156  zlib_data_->initialized = false;
157  }
158 }
159 
160 size_t GzipInputSource::read_( char* buffer, size_t size )
161 {
162  // Shorthands to the data members.
163  assert( zlib_data_ );
164  auto& zstream = zlib_data_->zstream;
165  auto& in_buf = zlib_data_->in_buf;
166  auto& in_pos = zlib_data_->in_pos;
167  auto& in_end = zlib_data_->in_end;
168 
169  // How much have we already done, how much do we need to do, and where to put it.
170  // (The latter two are aliases for consistency of notation...)
171  size_t out_pos = 0;
172  size_t const out_end = size;
173  char* out_buf = buffer;
174 
175  // Inflate data until the output buffer is full with the desired number of bytes.
176  while( out_pos < out_end ) {
177 
178  // If the input buffer is already used up (or not yet read, in the beginning),
179  // read from the source.
180  if( in_pos >= in_end ) {
181  in_pos = 0;
182  in_end = input_source_->read( in_buf, BlockLength );
183  }
184  assert( in_end >= in_pos );
185  assert( out_end >= out_pos );
186 
187  // Reached end of input
188  if( in_pos == in_end ) {
189  break;
190  }
191 
192  // If we reached the end of a (partial) gzip stream in input that consists of concatenated
193  // gzip streams, we need to create a zlib instance again.
194  assert( zlib_data_ );
195  if( ! zlib_data_->initialized ) {
196  create_zstream_();
197  }
198 
199  // Read starting from the current input position, as much as there still is data.
200  // We use char data, but zlib expects unsigned char. So here, we cast in one direction,
201  // and in the output buffer, we again cast back. This doesn't change the byte content,
202  // so this is okay.
203  zstream.avail_in = static_cast<unsigned int>( in_end - in_pos );
204  zstream.next_in = reinterpret_cast<Bytef*>( in_buf ) + in_pos;
205 
206  // Write to the current output position, as much as there still is space.
207  zstream.avail_out = static_cast<unsigned int>( out_end - out_pos );
208  zstream.next_out = reinterpret_cast<Bytef*>( out_buf ) + out_pos;
209 
210  // Run.
211  auto ret = inflate( &zstream, Z_NO_FLUSH );
212 
213  // Error checks.
214  assert( ret != Z_STREAM_ERROR );
215  if( ret != Z_OK && ret != Z_STREAM_END ) {
216  if( ret == Z_NEED_DICT ) {
217  ret = Z_DATA_ERROR;
218  }
219  throw GzipError( zstream.msg, ret );
220  }
221 
222  // Update current positions.
223  in_pos = in_end - zstream.avail_in;
224  out_pos = out_end - zstream.avail_out;
225  assert( reinterpret_cast<char*>( zstream.next_in ) == in_buf + in_pos );
226  assert( reinterpret_cast<char*>( zstream.next_out ) == out_buf + out_pos );
227 
228  // Check if we reached the end of the input deflated stream. If so, this either means
229  // we have reached the valid end of the input data, or the input consists of multiple
230  // concatenated gzip streams. In the first case, we start one mor iteration of the loop,
231  // but will find the input empty and trigger the break condition there. In the latter case,
232  // destroying the zstream means that we are going to instanciate a new one in the next loop.
233  if( ret == Z_STREAM_END ) {
234  destroy_zstream_();
235  }
236  }
237 
238  // Either we filled up the whole buffer (and read size many bytes),
239  // or we reached the end of the input.
240  assert( out_pos == out_end || in_pos == in_end );
241 
242  // Return how many bytes we have put into the output buffer.
243  return out_pos;
244 }
245 
246 std::string GzipInputSource::source_string_() const
247 {
248  // Check if the extension is one that we want to remove.
249  auto const bn = file_basename( input_source_->source_string() );
250  auto const ex = file_extension( bn );
251 
252  // If so, use the full name again to get the complete path, but remove the extension.
253  if( ex == "gz" || ex == "gzip" || ex == "zlib" ) {
254  return file_filename( input_source_->source_string() );
255  }
256  return input_source_->source_string();
257 }
258 
259 int GzipInputSource::get_format_( GzipInputSource::Format format ) const
260 {
261  // Get the correct format int from the enum.
262  // We could use a typed enum, and directly use the enum to store the values,
263  // but this would require to include the zlib header in the header of this class,
264  // which we want to avoid.
265  switch( format ) {
266  case Format::kAutomatic:
267  return MAX_WBITS | 32;
268  case Format::kGzip:
269  return MAX_WBITS | 16;
270  case Format::kZlib:
271  return MAX_WBITS;
272  case Format::kDeflate:
273  return -MAX_WBITS;
274  default:
275  assert( false );
276  return 0;
277  }
278 }
279 
280 std::string GzipInputSource::translate_format_( GzipInputSource::Format format ) const
281 {
282  switch( format ) {
283  case Format::kAutomatic:
284  return "gzip/zlib";
285  case Format::kGzip:
286  return "gzip";
287  case Format::kZlib:
288  return "zlib";
289  case Format::kDeflate:
290  return "deflate";
291  default:
292  assert( false );
293  return "";
294  }
295 }
296 
297 // =================================================================================================
298 // Gzip Input Source: Functions without zlib
299 // =================================================================================================
300 
301 #else // GENESIS_ZLIB
302 
303 // Here, we define the class members as empty functions, throwing in the constructor.
304 // This is offered to be able to write code that mentions the class, without having to have zlib.
305 
307 {
308  // Empty on purpose.
309 
310  // For some weird reason, Doxygen messes up the documentation if this function comes after the
311  // contructor definition below. Probably due to Doxygen being unable to correctly parse that
312  // lambda in the initilizer. Anyway, by putting the destructor definition here, the Doxygen
313  // problem is solved.
314 }
315 
317  std::shared_ptr<BaseInputSource>,
319 )
320  : input_source_()
321  , format_name_()
322  , zlib_data_( nullptr, []( ZlibData* ){} )
323 {
324  // Just avoid doing anything really.
325  throw std::runtime_error( "zlib: Genesis was not compiled with zlib support." );
326 }
327 
328 void GzipInputSource::create_zstream_()
329 {
330  // Empty on purpose.
331 }
332 
333 void GzipInputSource::destroy_zstream_()
334 {
335  // Empty on purpose.
336 }
337 
338 size_t GzipInputSource::read_( char*, size_t )
339 {
340  return 0;
341 }
342 
343 std::string GzipInputSource::source_string_() const
344 {
345  return "";
346 }
347 
348 int GzipInputSource::get_format_( GzipInputSource::Format ) const
349 {
350  // Avoid compiler warnings: have to use format_ somewhere!
351  (void) format_;
352  return 0;
353 }
354 
355 std::string GzipInputSource::translate_format_( GzipInputSource::Format ) const
356 {
357  return "";
358 }
359 
360 #endif // GENESIS_ZLIB
361 
362 } // namespace utils
363 } // namespace genesis
gzip_input_source.hpp
genesis::utils::GzipInputSource::Format::kGzip
@ kGzip
Use gzip decompression.
fs.hpp
Provides functions for accessing the file system.
gzip.hpp
genesis::utils::GzipInputSource::Format::kZlib
@ kZlib
Use zlib decompression.
genesis::utils::file_filename
std::string file_filename(std::string const &filename)
Remove extension if present.
Definition: fs.cpp:811
genesis::utils::GzipError
Exception class thrown by failed gzip/zlib operations.
Definition: gzip.hpp:64
genesis::utils::GzipInputSource::~GzipInputSource
~GzipInputSource() override
Definition: gzip_input_source.cpp:117
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::GzipInputSource::GzipInputSource
GzipInputSource(std::shared_ptr< BaseInputSource > input_source, Format format=Format::kAutomatic)
Construct the input source using another input source (FileInputSource, StringInputSource,...
Definition: gzip_input_source.cpp:101
genesis::utils::GzipInputSource::Format::kDeflate
@ kDeflate
Use a pure deflate decompression.
genesis::utils::GzipInputSource::Format::kAutomatic
@ kAutomatic
Enable automatic header detection, allowing either gzip or zlib.
genesis::utils::GzipInputSource::Format
Format
Format used by gzip/zlib for decompression.
Definition: gzip_input_source.hpp:67
genesis::utils::file_basename
std::string file_basename(std::string const &filename)
Remove directory name from file name if present.
Definition: fs.cpp:788
genesis::utils::file_extension
std::string file_extension(std::string const &filename)
Return the extension name of a file.
Definition: fs.cpp:821