A toolkit for working with phylogenetic data.
v0.24.0
gzip_input_source.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
32 
35 
36 #include <cassert>
37 #include <cstdio>
38 #include <cstring>
39 #include <stdexcept>
40 
41 #ifdef GENESIS_ZLIB
42 
43 # include "zlib.h"
44 
45 # if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
46 # include <fcntl.h>
47 # include <io.h>
48 # endif
49 
50 #endif // GENESIS_ZLIB
51 
52 namespace genesis {
53 namespace utils {
54 
55 // =================================================================================================
56 // Zlib Data
57 // =================================================================================================
58 
59 #ifdef GENESIS_ZLIB
60 
64 struct GzipInputSource::ZlibData
65 {
66  // Zlib object
67  z_stream z_stream_;
68 
69  // Input buffer, our current position in the buffer, and the past-the-end position
70  // (can be shorter than the buffer length, if there is not enough input).
71  char in_buf_[ BlockLength ];
72  size_t in_pos_ = 0;
73  size_t in_end_ = 0;
74 };
75 
76 #else // GENESIS_ZLIB
77 
81 struct GzipInputSource::ZlibData
82 {
83  // Empty on purpose.
84 };
85 
86 #endif // GENESIS_ZLIB
87 
88 // =================================================================================================
89 // Gzip Input Source: Functions with zlib
90 // =================================================================================================
91 
92 #ifdef GENESIS_ZLIB
93 
95  std::shared_ptr<BaseInputSource> input_source,
97 )
98  : input_source_( input_source )
99  , format_name_( translate_format_( format ))
100  , zlib_data_(
101  new ZlibData(),
102  []( ZlibData *impl ) { delete impl; }
103  )
104 {
105  // Allocate zlib inflate state
106  auto& z_stream_ = zlib_data_->z_stream_;
107  z_stream_.zalloc = Z_NULL;
108  z_stream_.zfree = Z_NULL;
109  z_stream_.opaque = Z_NULL;
110  z_stream_.avail_in = 0;
111  z_stream_.next_in = Z_NULL;
112 
113  // Init zlib
114  auto ret = inflateInit2( &z_stream_, get_format_( format ));
115  if( ret != Z_OK ) {
116  throw except::GzipError( z_stream_.msg, ret );
117  }
118 }
119 
121 {
122  // Call the zlib destructor. This is called before the inner class is destroyed,
123  // so this is in correct order.
124  inflateEnd( &zlib_data_->z_stream_ );
125 }
126 
127 size_t GzipInputSource::read_( char* buffer, size_t size )
128 {
129  // Shorthands to the data members.
130  auto& z_stream_ = zlib_data_->z_stream_;
131  auto& in_buf_ = zlib_data_->in_buf_;
132  auto& in_pos_ = zlib_data_->in_pos_;
133  auto& in_end_ = zlib_data_->in_end_;
134 
135  // How much have we already done, how much do we need to do, and where to put it.
136  // (The latter two are aliases for consistency...)
137  size_t out_pos = 0;
138  size_t const out_end = size;
139  char* out_buf = buffer;
140 
141  // Inflate data until the output buffer is full with the desired number of bytes.
142  while( out_pos < out_end ) {
143 
144  // If the input buffer is already used up (or not yet read, in the beginning),
145  // read from the source.
146  if( in_pos_ >= in_end_ ) {
147  in_pos_ = 0;
148  in_end_ = input_source_->read( in_buf_, BlockLength );
149  }
150  assert( in_end_ >= in_pos_ );
151  assert( out_end >= out_pos );
152 
153  // Read starting from the current input position, as much as there still is data.
154  // We use char data, but zlib expects unsigned char. So here, we cast in one direction,
155  // and in the output buffer, we again cast back. This doesn't change the byte content,
156  // so this is okay.
157  z_stream_.avail_in = static_cast<unsigned int>( in_end_ - in_pos_ );
158  z_stream_.next_in = reinterpret_cast<Bytef*>( in_buf_ ) + in_pos_;
159 
160  // Write to the current output position, as much as there still is space.
161  z_stream_.avail_out = static_cast<unsigned int>( out_end - out_pos );
162  z_stream_.next_out = reinterpret_cast<Bytef*>( out_buf ) + out_pos;
163 
164  // Run.
165  auto ret = inflate( &z_stream_, Z_NO_FLUSH );
166 
167  // Error checks.
168  assert( ret != Z_STREAM_ERROR );
169  if( ret == Z_NEED_DICT ) {
170  ret = Z_DATA_ERROR;
171  }
172  switch( ret ) {
173  case Z_DATA_ERROR:
174  case Z_MEM_ERROR:
175  throw except::GzipError( z_stream_.msg, ret );
176  }
177 
178  // Update current positions.
179  in_pos_ = in_end_ - z_stream_.avail_in;
180  out_pos = out_end - z_stream_.avail_out;
181 
182  // Check if we reached the end of the input deflated stream
183  if( ret == Z_STREAM_END ) {
184 
185  // We only process whole gz files. So, if we reach the end of the deflate stream,
186  // we must also be at the end of the input.
187  assert( in_pos_ == in_end_ );
188 
189  break;
190  }
191  }
192 
193  // Either we filled up the whole buffer (and read size many bytes),
194  // or we reached the end of the input.
195  assert( out_pos == out_end || in_pos_ == in_end_ );
196 
197  // Return how many bytes we have out into the output buffer.
198  return out_pos;
199 }
200 
201 std::string GzipInputSource::source_string_() const
202 {
203  // Check if the extension is one that we want to remove.
204  auto const bn = file_basename( input_source_->source_string() );
205  auto const ex = file_extension( bn );
206 
207  // If so, use the full name again to get the complete path, but remove the extension.
208  if( ex == "gz" || ex == "gzip" || ex == "zlib" ) {
209  return file_filename( input_source_->source_string() );
210  }
211  return input_source_->source_string();
212 }
213 
214 int GzipInputSource::get_format_( GzipInputSource::Format format ) const
215 {
216  // Get the correct format int from the enum.
217  // We could use a typed enum, and directly use the enum to store the values,
218  // but this would require to include the zlib header in the header of this class,
219  // which we want to avoid.
220  switch( format ) {
221  case Format::kAutomatic:
222  return MAX_WBITS | 32;
223  case Format::kGzip:
224  return MAX_WBITS | 16;
225  case Format::kZlib:
226  return MAX_WBITS;
227  case Format::kDeflate:
228  return -MAX_WBITS;
229  default:
230  assert( false );
231  return 0;
232  }
233 }
234 
235 std::string GzipInputSource::translate_format_( GzipInputSource::Format format ) const
236 {
237  switch( format ) {
238  case Format::kAutomatic:
239  return "gzip/zlib";
240  case Format::kGzip:
241  return "gzip";
242  case Format::kZlib:
243  return "zlib";
244  case Format::kDeflate:
245  return "deflate";
246  default:
247  assert( false );
248  return "";
249  }
250 }
251 
252 // =================================================================================================
253 // Gzip Input Source: Functions without zlib
254 // =================================================================================================
255 
256 #else // GENESIS_ZLIB
257 
258 // Without zlib, we still define all class members, but as empty stubs, and the constructor throws.
259 // This allows code that merely mentions the class to be built even if zlib is not available.
260 
262 {
263  // Empty on purpose.
264 
265  // For some weird reason, Doxygen messes up the documentation if this function comes after the
266  // contructor definition below. Probably due to Doxygen being unable to correctly parse that
267  // lambda in the initilizer. Anyway, by putting the destructor definition here, the Doxygen
268  // problem is solved.
269 }
270 
272  std::shared_ptr<BaseInputSource>,
274 )
275  : input_source_()
276  , format_name_()
277  , zlib_data_( nullptr, []( ZlibData* ){} )
278 {
279  // Just avoid doing anything really.
280  throw std::runtime_error( "zlib: Genesis was not compiled with zlib support." );
281 }
282 
283 size_t GzipInputSource::read_( char*, size_t )
284 {
285  return 0;
286 }
287 
288 std::string GzipInputSource::source_string_() const
289 {
290  return "";
291 }
292 
293 int GzipInputSource::get_format_( GzipInputSource::Format ) const
294 {
295  return 0;
296 }
297 
298 std::string GzipInputSource::translate_format_( GzipInputSource::Format ) const
299 {
300  return "";
301 }
302 
303 #endif // GENESIS_ZLIB
304 
305 } // namespace utils
306 } // namespace genesis
std::string file_extension(std::string const &filename)
Return the extension name of a file.
Definition: fs.cpp:707
std::string file_filename(std::string const &filename)
Remove extension if present.
Definition: fs.cpp:696
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
std::string file_basename(std::string const &filename)
Remove directory name from file name if present.
Definition: fs.cpp:685
GzipInputSource(std::shared_ptr< BaseInputSource > input_source, Format format=Format::kAutomatic)
Construct the input source using another input source (FileInputSource, StringInputSource, StreamInputSource, etc), and add gzip/zlib decompression on top, using the specified GzipInputSource::Format.
Provides functions for accessing the file system.
Exception class thrown by failed gzip/zlib operations.
Definition: gzip.hpp:67
Use a pure deflate decompression.
Enable automatic header detection, allowing either gzip or zlib.
Format
Format used by gzip/zlib for decompression.