A toolkit for working with phylogenetic data.
v0.24.0
gzip_stream.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2020 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
24 /*
25  The code in this file as well as the according header file lib/utils/io/gzip_stream.hpp are
26  adapted from the excellent zstr library (C++ header-only ZLib wrapper classes) by Matei David,
27  see https://github.com/mateidavid/zstr
28 
29  We adapted the original code by renaming all classes and variables to our standards,
30  moving much of the implementation into a source file (so that the header does not clutter
31  its callers with zlib-internal symbols), and refining some functionality.
32 
33  For this and the according header file, we need to include the following original license:
34 
35  The MIT License (MIT)
36 
37  Copyright (c) 2015 Matei David, Ontario Institute for Cancer Research
38 
39  Permission is hereby granted, free of charge, to any person obtaining a copy
40  of this software and associated documentation files (the "Software"), to deal
41  in the Software without restriction, including without limitation the rights
42  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
43  copies of the Software, and to permit persons to whom the Software is
44  furnished to do so, subject to the following conditions:
45 
46  The above copyright notice and this permission notice shall be included in all
47  copies or substantial portions of the Software.
48 
49  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
54  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
55  SOFTWARE.
56  */
57 
66 
67 #include <cassert>
68 #include <fstream>
69 #include <sstream>
70 #include <stdexcept>
71 #include <string>
72 
73 #ifdef GENESIS_ZLIB
74 
75 # include "zlib.h"
76 
77 # if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
78 # include <fcntl.h>
79 # include <io.h>
80 # endif
81 
82 #endif // GENESIS_ZLIB
83 
84 namespace genesis {
85 namespace utils {
86 
87 // We only include all the class definitions if we actually use zlib.
88 // If not, we later also provide dummy implementations that throw if instantiated.
89 #ifdef GENESIS_ZLIB
90 
91 // ================================================================================================
92 // Gzip Stream Wrapper
93 // ================================================================================================
94 
103 class GzipStreamWrapper
104  : public z_stream
105 {
106 public:
107 
108  GzipStreamWrapper( bool is_input = true, int level = Z_DEFAULT_COMPRESSION )
109  : is_input_(is_input)
110  {
111  this->zalloc = Z_NULL;
112  this->zfree = Z_NULL;
113  this->opaque = Z_NULL;
114  int ret;
115 
116  if( is_input_ ) {
117  this->avail_in = 0;
118  this->next_in = Z_NULL;
119  ret = inflateInit2( this, 15+32 );
120  } else {
121  if(
122  level < static_cast<int>( GzipCompressionLevel::kDefaultCompression ) ||
123  level > static_cast<int>( GzipCompressionLevel::kBestCompression )
124  ) {
125  throw std::invalid_argument(
126  "Compression level " + std::to_string( static_cast<int>(level )) +
127  " is invalid for usage in gzip output stream. Valid range is [ -1, 9 ]."
128  );
129  }
130  ret = deflateInit2( this, level, Z_DEFLATED, 15+16, 8, Z_DEFAULT_STRATEGY );
131  }
132 
133  if( ret != Z_OK ) {
134  throw except::GzipError( this->msg, ret );
135  }
136  }
137 
138  ~GzipStreamWrapper()
139  {
140  if( is_input_ ) {
141  inflateEnd(this);
142  } else {
143  deflateEnd(this);
144  }
145  }
146 
147 private:
148 
149  bool is_input_;
150 };
151 
152 // ================================================================================================
153 // Gzip Input Stream Buffer
154 // ================================================================================================
155 
174 class GzipIStreambuf
175  : public std::streambuf
176 {
177 public:
178 
179  // -------------------------------------------------------------
180  // Constructors and Rule of Five
181  // -------------------------------------------------------------
182 
183  GzipIStreambuf(
184  std::streambuf * sbuf_p,
185  bool auto_detect = true,
186  std::size_t buff_size = GZIP_DEFAULT_BUFFER_SIZE
187  )
188  : sbuf_p_(sbuf_p)
189  , zstrm_ptr_(nullptr)
190  , auto_detect_(auto_detect)
191  , buff_size_(buff_size)
192  , auto_detect_run_(false)
193  , is_text_(false)
194  {
195  assert(sbuf_p_);
196  in_buff_ = new char [buff_size_];
197  in_buff_start_ = in_buff_;
198  in_buff_end_ = in_buff_;
199  out_buff_ = new char [buff_size_];
200  setg(out_buff_, out_buff_, out_buff_);
201  }
202 
203  GzipIStreambuf( GzipIStreambuf const& ) = delete;
204  GzipIStreambuf( GzipIStreambuf && ) = delete;
205  GzipIStreambuf& operator = ( GzipIStreambuf const& ) = delete;
206  GzipIStreambuf& operator = ( GzipIStreambuf && ) = delete;
207 
208  virtual ~GzipIStreambuf()
209  {
210  delete [] in_buff_;
211  delete [] out_buff_;
212  if( zstrm_ptr_ ) {
213  delete zstrm_ptr_;
214  }
215  }
216 
217  // -------------------------------------------------------------
218  // Virtual functions
219  // -------------------------------------------------------------
220 
221  virtual std::streambuf::int_type underflow() override
222  {
223  if (this->gptr() == this->egptr()) {
224  // pointers for free region in output buffer
225  char * out_buff_free_start = out_buff_;
226 
227  do {
228  // read more input if none available
229  if (in_buff_start_ == in_buff_end_) {
230  // empty input buffer: refill from the start
231  in_buff_start_ = in_buff_;
232  std::streamsize sz = sbuf_p_->sgetn(in_buff_, buff_size_);
233  in_buff_end_ = in_buff_ + sz;
234  if (in_buff_end_ == in_buff_start_) break; // end of input
235  }
236 
237  // auto detect if the stream contains text or deflate data
238  if (auto_detect_ && ! auto_detect_run_) {
239  auto_detect_run_ = true;
240  unsigned char b0 = *reinterpret_cast< unsigned char * >(in_buff_start_);
241  unsigned char b1 = *reinterpret_cast< unsigned char * >(in_buff_start_ + 1);
242 
243  // Ref:
244  // http://en.wikipedia.org/wiki/Gzip
245  // http://stackoverflow.com/questions/9050260/what-does-a-zlib-header-look-like
246  is_text_ = ! (in_buff_start_ + 2 <= in_buff_end_
247  && ((b0 == 0x1F && b1 == 0x8B) // gzip header
248  || (b0 == 0x78 && (b1 == 0x01 // zlib header
249  || b1 == 0x9C
250  || b1 == 0xDA))));
251  }
252 
253  if (is_text_) {
254 
255  // simply swap in_buff_ and out_buff_, and adjust pointers
256  assert(in_buff_start_ == in_buff_);
257  std::swap(in_buff_, out_buff_);
258  out_buff_free_start = in_buff_end_;
259  in_buff_start_ = in_buff_;
260  in_buff_end_ = in_buff_;
261 
262  } else {
263 
264  // run inflate() on input
265  if (! zstrm_ptr_) {
266  zstrm_ptr_ = new GzipStreamWrapper(true);
267  }
268  zstrm_ptr_->next_in = reinterpret_cast< decltype(zstrm_ptr_->next_in) >(in_buff_start_);
269  zstrm_ptr_->avail_in = in_buff_end_ - in_buff_start_;
270  zstrm_ptr_->next_out = reinterpret_cast< decltype(zstrm_ptr_->next_out) >(out_buff_free_start);
271  zstrm_ptr_->avail_out = (out_buff_ + buff_size_) - out_buff_free_start;
272  int ret = inflate(zstrm_ptr_, Z_NO_FLUSH);
273 
274  // process return code
275  if (ret != Z_OK && ret != Z_STREAM_END) {
276  throw except::GzipError(zstrm_ptr_->msg, ret);
277  }
278 
279  // update in&out pointers following inflate()
280  in_buff_start_ = reinterpret_cast< decltype(in_buff_start_) >(zstrm_ptr_->next_in);
281  in_buff_end_ = in_buff_start_ + zstrm_ptr_->avail_in;
282  out_buff_free_start = reinterpret_cast< decltype(out_buff_free_start) >(zstrm_ptr_->next_out);
283  assert(out_buff_free_start + zstrm_ptr_->avail_out == out_buff_ + buff_size_);
284 
285  // if stream ended, deallocate inflator
286  if (ret == Z_STREAM_END) {
287  delete zstrm_ptr_;
288  zstrm_ptr_ = nullptr;
289  }
290  }
291  } while (out_buff_free_start == out_buff_);
292 
293  // 2 exit conditions:
294  // - end of input: there might or might not be output available
295  // - out_buff_free_start != out_buff_: output available
296  this->setg(out_buff_, out_buff_, out_buff_free_start);
297  }
298 
299  return this->gptr() == this->egptr()
300  ? traits_type::eof()
301  : traits_type::to_int_type(*this->gptr())
302  ;
303  }
304 
305  // -------------------------------------------------------------
306  // Members
307  // -------------------------------------------------------------
308 
309 private:
310 
311  std::streambuf * sbuf_p_;
312  char * in_buff_;
313  char * in_buff_start_;
314  char * in_buff_end_;
315  char * out_buff_;
316  GzipStreamWrapper * zstrm_ptr_;
317  bool auto_detect_;
318  std::size_t buff_size_;
319  bool auto_detect_run_;
320  bool is_text_;
321 };
322 
323 // ================================================================================================
324 // Gzip Output Stream Buffer
325 // ================================================================================================
326 
345 class GzipOStreambuf
346  : public std::streambuf
347 {
348 public:
349 
350  // -------------------------------------------------------------
351  // Constructors and Rule of Five
352  // -------------------------------------------------------------
353 
354  GzipOStreambuf(
355  std::streambuf * sbuf_p
356  )
357  : GzipOStreambuf( sbuf_p, Z_DEFAULT_COMPRESSION, GZIP_DEFAULT_BUFFER_SIZE )
358  {}
359 
360  GzipOStreambuf(
361  std::streambuf * sbuf_p,
362  int level,
363  std::size_t buff_size = GZIP_DEFAULT_BUFFER_SIZE
364  )
365  : sbuf_p_(sbuf_p)
366  , zstrm_ptr_(new GzipStreamWrapper(false, level))
367  , buff_size_(buff_size)
368  {
369  assert(sbuf_p_);
370  in_buff_ = new char [buff_size_];
371  out_buff_ = new char [buff_size_];
372  setp(in_buff_, in_buff_ + buff_size_);
373  }
374 
375  GzipOStreambuf( GzipOStreambuf const& ) = delete;
376  GzipOStreambuf( GzipOStreambuf &&) = delete;
377  GzipOStreambuf& operator = ( GzipOStreambuf const& ) = delete;
378  GzipOStreambuf& operator = ( GzipOStreambuf &&) = delete;
379 
380  virtual ~GzipOStreambuf()
381  {
382  // flush the zlib stream
383  //
384  // NOTE: Errors here (sync() return value not 0) are ignored, because we
385  // cannot throw in a destructor. This mirrors the behaviour of
386  // std::basic_filebuf::~basic_filebuf(). To see an exception on error,
387  // close the ofstream with an explicit call to close(), and do not rely
388  // on the implicit call in the destructor.
389  sync();
390 
391  delete [] in_buff_;
392  delete [] out_buff_;
393  delete zstrm_ptr_;
394  }
395 
396  // -------------------------------------------------------------
397  // Internal and Virtual Functions
398  // -------------------------------------------------------------
399 
400  virtual std::streambuf::int_type overflow(std::streambuf::int_type c = traits_type::eof()) override
401  {
402  zstrm_ptr_->next_in = reinterpret_cast< decltype(zstrm_ptr_->next_in) >(pbase());
403  zstrm_ptr_->avail_in = pptr() - pbase();
404  while (zstrm_ptr_->avail_in > 0) {
405  int r = deflate_loop_(Z_NO_FLUSH);
406  if (r != 0) {
407  setp(nullptr, nullptr);
408  return traits_type::eof();
409  }
410  }
411  setp(in_buff_, in_buff_ + buff_size_);
412  return traits_type::eq_int_type(c, traits_type::eof()) ? traits_type::eof() : sputc(c);
413  }
414 
415  virtual int sync() override
416  {
417  // first, call overflow to clear in_buff_
418  overflow();
419  if (! pptr()) {
420  return -1;
421  }
422 
423  // then, call deflate asking to finish the zlib stream
424  zstrm_ptr_->next_in = nullptr;
425  zstrm_ptr_->avail_in = 0;
426 
427  if (deflate_loop_(Z_FINISH) != 0) {
428  return -1;
429  }
430  deflateReset(zstrm_ptr_);
431  return 0;
432  }
433 
434 private:
435 
436  int deflate_loop_(int flush)
437  {
438  while( true ) {
439  zstrm_ptr_->next_out = reinterpret_cast< decltype(zstrm_ptr_->next_out) >(out_buff_);
440  zstrm_ptr_->avail_out = buff_size_;
441  int ret = deflate(zstrm_ptr_, flush);
442  if (ret != Z_OK && ret != Z_STREAM_END && ret != Z_BUF_ERROR) {
443  throw except::GzipError( zstrm_ptr_->msg, ret );
444  }
445 
446  std::streamsize sz = sbuf_p_->sputn(
447  out_buff_, reinterpret_cast< decltype(out_buff_) >(zstrm_ptr_->next_out) - out_buff_
448  );
449 
450  if (sz != reinterpret_cast< decltype(out_buff_) >(zstrm_ptr_->next_out) - out_buff_) {
451  // there was an error in the sink stream
452  return -1;
453  }
454 
455  if (ret == Z_STREAM_END || ret == Z_BUF_ERROR || sz == 0) {
456  break;
457  }
458  }
459  return 0;
460  }
461 
462  // -------------------------------------------------------------
463  // Members
464  // -------------------------------------------------------------
465 
466 private:
467 
468  std::streambuf * sbuf_p_;
469  char * in_buff_;
470  char * out_buff_;
471  GzipStreamWrapper * zstrm_ptr_;
472  std::size_t buff_size_;
473 };
474 
475 // ================================================================================================
476 // Gzip Input Stream
477 // ================================================================================================
478 
479 // We have all the implementation here, so that we do not need to expose the stream buffers.
480 
481 GzipIStream::GzipIStream( std::istream& is, bool auto_detect, std::size_t buffer_size )
482  : std::istream( new GzipIStreambuf( is.rdbuf(), auto_detect, buffer_size ))
483 {
484  exceptions(std::ios_base::badbit);
485 }
486 
487 GzipIStream::GzipIStream( std::streambuf* sbuf_p, bool auto_detect, std::size_t buffer_size )
488  : std::istream( new GzipIStreambuf( sbuf_p, auto_detect, buffer_size ))
489 {
490  exceptions(std::ios_base::badbit);
491 }
492 
494 {
495  delete rdbuf();
496 }
497 
498 // ================================================================================================
499 // Gzip Output Stream
500 // ================================================================================================
501 
502 // Let's make sure that our levels match the zlib levels. Pretty sure that zlib will never
503 // change their levels, but if they ever do, we want to know.
504 static_assert(
505  static_cast<int>(GzipCompressionLevel::kDefaultCompression) == Z_DEFAULT_COMPRESSION &&
506  static_cast<int>(GzipCompressionLevel::kNoCompression) == Z_NO_COMPRESSION &&
507  static_cast<int>(GzipCompressionLevel::kBestSpeed) == Z_BEST_SPEED &&
508  static_cast<int>(GzipCompressionLevel::kBestCompression) == Z_BEST_COMPRESSION,
509  "It seems that the zlib-internal compression levels values have changed "
510  "compared to the values that we internally use in genesis. This needs to be fixed, "
511  "please submit a bug report to https://github.com/lczech/genesis/issues"
512 );
513 
514 // We have all the implementation here, so that we do not need to expose the stream buffers.
515 
516 GzipOStream::GzipOStream( std::ostream& os, GzipCompressionLevel level, std::size_t buffer_size )
517  : std::ostream( new GzipOStreambuf( os.rdbuf(), static_cast<int>(level), buffer_size ))
518 {
519  exceptions(std::ios_base::badbit);
520 }
521 
522 GzipOStream::GzipOStream( std::streambuf* sbuf_p, GzipCompressionLevel level, std::size_t buffer_size )
523  : std::ostream( new GzipOStreambuf( sbuf_p, static_cast<int>(level), buffer_size ))
524 {
525  exceptions(std::ios_base::badbit);
526 }
527 
529 {
530  delete rdbuf();
531 }
532 
533 // ================================================================================================
534 // Gzip Input File Stream
535 // ================================================================================================
536 
538  std::string const& filename,
539  std::ios_base::openmode mode,
540  bool auto_detect,
541  std::size_t buffer_size
542 )
543  // Open in binary mode, which should also work for uncompressed files on Unix,
544  // but might break on Windows, as it then does not do the line ending conversions.
545  // See https://github.com/mateidavid/zstr/issues/15
546  : StrictFStreamHolder<StrictIFStream>( filename, mode | std::ios_base::binary )
547  , std::istream( new GzipIStreambuf( file_stream_.rdbuf(), auto_detect, buffer_size ))
548 {
549  exceptions(std::ios_base::badbit);
550 }
551 
553 {
554  if (rdbuf()) {
555  delete rdbuf();
556  }
557 }
558 
559 // ================================================================================================
560 // Gzip Output File Stream
561 // ================================================================================================
562 
564  std::string const& filename,
565  std::ios_base::openmode mode,
566  GzipCompressionLevel level,
567  std::size_t buffer_size
568 )
569  : StrictFStreamHolder<StrictOFStream>( filename, mode | std::ios_base::binary )
570  , std::ostream( new GzipOStreambuf( file_stream_.rdbuf(), static_cast<int>(level), buffer_size ))
571 {
572  exceptions(std::ios_base::badbit);
573 }
574 
576 {
577  if (rdbuf()) {
578  delete rdbuf();
579  }
580 }
581 
583 {
584  std::ostream::flush();
585  file_stream_.flush();
586  return *this;
587 }
588 
589 // Up until here, we defined all classes needed for gzip streaming.
590 // However, if genesis is compiled without zlib support, we instead use dummy implementations
591 // which throw exceptions when being used.
592 #else // GENESIS_ZLIB
593 
594 // ================================================================================================
595 // Gzip Input Stream
596 // ================================================================================================
597 
598 GzipIStream::GzipIStream( std::istream&, bool, std::size_t )
599 {
600  throw std::runtime_error( "zlib: Genesis was not compiled with zlib support." );
601 }
602 
603 GzipIStream::GzipIStream( std::streambuf*, bool, std::size_t )
604 {
605  throw std::runtime_error( "zlib: Genesis was not compiled with zlib support." );
606 }
607 
609 {}
610 
611 // ================================================================================================
612 // Gzip Output Stream
613 // ================================================================================================
614 
615 GzipOStream::GzipOStream( std::ostream&, GzipCompressionLevel, std::size_t )
616 {
617  throw std::runtime_error( "zlib: Genesis was not compiled with zlib support." );
618 }
619 
620 GzipOStream::GzipOStream( std::streambuf*, GzipCompressionLevel, std::size_t )
621 {
622  throw std::runtime_error( "zlib: Genesis was not compiled with zlib support." );
623 }
624 
626 {}
627 
628 // ================================================================================================
629 // Gzip Input File Stream
630 // ================================================================================================
631 
632 GzipIFStream::GzipIFStream( std::string const&, std::ios_base::openmode, bool, std::size_t )
633 {
634  throw std::runtime_error( "zlib: Genesis was not compiled with zlib support." );
635 }
636 
638 {
639  // Nothing to do;
640 }
641 
642 // ================================================================================================
643 // Gzip Output File Stream
644 // ================================================================================================
645 
646 GzipOFStream::GzipOFStream( std::string const&, std::ios_base::openmode, GzipCompressionLevel, std::size_t )
647 {
648  throw std::runtime_error( "zlib: Genesis was not compiled with zlib support." );
649 }
650 
652 {
653  // Nothing to do;
654 }
655 
657 {
658  throw std::runtime_error( "zlib: Genesis was not compiled with zlib support." );
659 }
660 
661 #endif // GENESIS_ZLIB
662 
663 } // namespace utils
664 } // namespace genesis
Stream that defines a strict wrapper around std::ifstream.
GzipCompressionLevel
List of possible compression levels used for GzipOStream.
Stream that defines a strict wrapper around std::ofstream.
GzipOFStream & flush()
Flush, so one can save in the middle of writing a file for synchronization purposes.
void swap(SequenceSet &lhs, SequenceSet &rhs)
STL namespace.
Helper class template for managing the construction order between stream classes. ...
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
static const std::size_t GZIP_DEFAULT_BUFFER_SIZE
Default buffer size for all gzip (de)compression buffers.
GzipOStream(std::ostream &os, GzipCompressionLevel level=GzipCompressionLevel::kDefaultCompression, std::size_t buffer_size=GZIP_DEFAULT_BUFFER_SIZE)
Out file stream that offers on-the-fly gzip-compression.
std::shared_ptr< BaseOutputTarget > to_string(std::string &target_string)
Obtain an output target for writing to a string.
GzipIStream(std::istream &is, bool auto_detect=true, std::size_t buffer_size=GZIP_DEFAULT_BUFFER_SIZE)
GzipIFStream(std::string const &filename, std::ios_base::openmode mode=std::ios_base::in, bool auto_detect=true, std::size_t buffer_size=GZIP_DEFAULT_BUFFER_SIZE)
GzipOFStream(std::string const &filename, std::ios_base::openmode mode=std::ios_base::out, GzipCompressionLevel level=GzipCompressionLevel::kDefaultCompression, std::size_t buffer_size=GZIP_DEFAULT_BUFFER_SIZE)