// input_stream.hpp (source listing; originally published as input__stream_8hpp_source.html)

#ifndef GENESIS_UTILS_IO_INPUT_STREAM_H_

#define GENESIS_UTILS_IO_INPUT_STREAM_H_


/*

    Genesis - A toolkit for working with phylogenetic data.

    Copyright (C) 2014-2022 Lucas Czech


    This program is free software: you can redistribute it and/or modify

    it under the terms of the GNU General Public License as published by

    the Free Software Foundation, either version 3 of the License, or

    (at your option) any later version.


    This program is distributed in the hope that it will be useful,

    but WITHOUT ANY WARRANTY; without even the implied warranty of

    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

    GNU General Public License for more details.


    You should have received a copy of the GNU General Public License

    along with this program.  If not, see <http://www.gnu.org/licenses/>.


    Contact:

    Lucas Czech <lczech@carnegiescience.edu>

    Department of Plant Biology, Carnegie Institution For Science

    260 Panama Street, Stanford, CA 94305, USA

*/


#include "genesis/utils/core/std.hpp"
#include "genesis/utils/io/input_reader.hpp"
#include "genesis/utils/io/input_source.hpp"
#include "genesis/utils/text/char.hpp"

#include <cassert>
#include <cstdint>
#include <functional>
#include <limits>
#include <memory>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <utility>


namespace genesis {

namespace utils {


// =================================================================================================

//     Input Stream

// =================================================================================================


/**
 * @brief Buffered, char-wise stream over an input source that tracks line and column.
 *
 * The stream exposes one "current" char at a time and is advanced with operator++() or one of
 * the reading functions. While advancing, line breaks are normalized: a `\r` and a `\r\n` are
 * both presented as a single `\n`, and if the data does not end in a line break, a final `\n`
 * is appended (see set_current_char_()).
 *
 * The class owns its buffer and is movable, but not copyable. Several member functions are only
 * declared here and defined out of line.
 */
class InputStream
{
public:

    // -------------------------------------------------------------------------
    //     Member Types
    // -------------------------------------------------------------------------

    /**
     * @brief Block length of the buffer, in bytes (`1 << 22`).
     *
     * Once the read position reaches this value, operator++() and jump_unchecked() call
     * update_blocks_() before continuing.
     */
    static const size_t BlockLength = 1 << 22;

    using self_type         = InputStream;
    using value_type        = char;

    // -------------------------------------------------------------------------
    //     Constructors and Rule of Five
    // -------------------------------------------------------------------------

    /**
     * @brief Create an empty stream without data.
     *
     * The resulting stream is at its end: good() is `false`, the current char is `\0`,
     * and line() and column() are both 0.
     */
    InputStream()
        : source_name_( "invalid source" )
        , buffer_(   nullptr )
        , data_pos_( 0 )
        , data_end_( 0 )
        , current_( '\0' )
        , line_(     0 )
        , column_(   0 )
    {}

    /**
     * @brief Create a stream that reads from the given input source.
     *
     * Line and column start at 1; everything else is set up by init_(), which is defined out of
     * line. All members are given defined values beforehand, so that init_() never sees
     * indeterminate state.
     */
    explicit InputStream( std::shared_ptr<BaseInputSource> input_source )
        : buffer_(   nullptr )
        , data_pos_( 0 )
        , data_end_( 0 )
        , current_( '\0' )
        , line_(     1 )
        , column_(   1 )
    {
        init_( input_source );
    }

    /**
     * @brief Release the buffer.
     */
    ~InputStream()
    {
        delete[] buffer_;
        buffer_ = nullptr;
    }

    InputStream( self_type const& ) = delete;

    /**
     * @brief Move construction, implemented via move assignment.
     *
     * The instance is first brought into the same state as a default constructed stream without
     * buffer, so that the move assignment operator works on fully initialized members.
     */
    InputStream( self_type&& other )
        : buffer_(   nullptr )
        , data_pos_( 0 )
        , data_end_( 0 )
        , current_( '\0' )
        , line_(     0 )
        , column_(   0 )
    {
        *this = std::move( other );
    }

    self_type& operator= ( self_type const& ) = delete;

    /**
     * @brief Move assignment. Defined out of line.
     */
    self_type& operator= ( self_type&& other );

    // -------------------------------------------------------------------------
    //     Stream Iterator Operations
    // -------------------------------------------------------------------------

    /**
     * @brief Return the current char without any checks.
     *
     * At the end of the data, this is `\0`.
     */
    inline char operator* () const
    {
        return current_;
    }

    /**
     * @brief Return the current char, with checks.
     *
     * @throws std::runtime_error if the stream is at its end.
     * @throws std::domain_error  if the current char is not in the ASCII range `[0, 127]`.
     */
    inline char current() const
    {
        if( data_pos_ >= data_end_ ) GENESIS_UNLIKELY {
            throw std::runtime_error(
                "Unexpected end of " + source_name() + " at " + at() + "."
            );
        }

        // Test the value range explicitly instead of relying on `current_ < 0`: the signedness
        // of plain char is implementation-defined, and with an unsigned char that comparison is
        // never true. On platforms with signed char, both tests are equivalent.
        if( static_cast<unsigned char>( current_ ) > 127 ) GENESIS_UNLIKELY {
            throw std::domain_error(
                "Invalid input char in " + source_name() + " at " + at() + "."
            );
        }
        return current_;
    }

    /**
     * @brief Move to the next char. Same as operator++().
     */
    inline self_type& advance()
    {
        operator++();
        return *this;
    }

    /**
     * @brief Move to the next char, updating line and column.
     *
     * If the stream already is at its end, line and column are set to 0 and the current char
     * to `\0` instead (see reset_()).
     */
    inline self_type& operator ++ ()
    {
        // If we were already at the end, set the counters to zero.
        if( data_pos_ >= data_end_ ) GENESIS_UNLIKELY {
            reset_();
            return *this;
        }

        // Read data if necessary.
        if( data_pos_ >= BlockLength ) GENESIS_UNLIKELY {
            update_blocks_();
        }
        assert( data_pos_ < BlockLength );

        // In case we are moving to a new line, set the counters accordingly.
        if( current_ == '\n' ) {
            ++line_;
            column_ = 1;
        } else {
            ++column_;
        }

        // Next position.
        ++data_pos_;

        // Set the char.
        set_current_char_();
        return *this;
    }

    /**
     * @brief Return the current char (unchecked, as in operator*()) and move to the next one.
     */
    inline char get_char()
    {
        char ret = current_;
        operator++();
        return ret;
    }

    // -------------------------------------------------------------------------
    //     Line Operations
    // -------------------------------------------------------------------------

    /**
     * @brief Line reading into @p target. Defined out of line.
     *
     * NOTE(review): presumably reads from the current position to the end of the line and moves
     * to the beginning of the next line; confirm details against the implementation.
     */
    void get_line( std::string& target );

    /**
     * @brief Call get_line( std::string& ) on an empty string, and return that string.
     */
    std::string get_line()
    {
        std::string result;
        get_line( result );
        return result;
    }

    // -------------------------------------------------------------------------
    //     Char Operations
    // -------------------------------------------------------------------------

    /**
     * @brief Check that the current char equals @p criterion, then move to the next char.
     *
     * @return The @p criterion char.
     * @throws std::runtime_error if the stream is at its end or the current char differs.
     */
    inline char read_char_or_throw( char const criterion )
    {
        // Check char and move to next.
        if( data_pos_ >= data_end_ || current_ != criterion ) GENESIS_UNLIKELY {
            throw std::runtime_error(
                std::string("In ") + source_name() + ": " +
                "Expecting " + char_to_hex( criterion ) + " at " + at() + ", " +
                "but received " + char_to_hex( current_ ) + " instead."
            );
        }
        assert( good() && current_ == criterion );
        operator++();
        return criterion;
    }

    /**
     * @brief Check that @p criterion returns `true` for the current char, then move to the
     * next char.
     *
     * @return The char that was checked.
     * @throws std::runtime_error if the stream is at its end or the criterion is not fulfilled.
     */
    inline char read_char_or_throw( std::function<bool (char)> criterion )
    {
        // Check char and move to next.
        if( data_pos_ >= data_end_ || !criterion( current_ )) GENESIS_UNLIKELY {
            throw std::runtime_error(
                std::string("In ") + source_name() + ": " +
                "Unexpected char " + char_to_hex( current_ ) + " at " + at() + "."
            );
        }
        assert( good() );
        auto const chr = current_;
        operator++();
        return chr;
    }

    // -------------------------------------------------------------------------
    //     Parsers
    // -------------------------------------------------------------------------

private:

    // Only use intrinsics version for the compilers that support them!
    #if defined(__GNUC__) || defined(__GNUG__) || defined(__clang__)

        size_t parse_unsigned_integer_gcc_intrinsic_();

        size_t parse_unsigned_integer_from_chars_();

    #endif

    // Only use C++17 code if we are compiled with that version.
    #if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)

        size_t parse_unsigned_integer_std_from_chars_();

    #endif

    size_t parse_unsigned_integer_naive_();

    // Entry point used by the public parsing functions below. Defined out of line.
    size_t parse_unsigned_integer_size_t_();

public:

    /**
     * @brief Parse an unsigned integer at the current position and return it as type @p T.
     *
     * The number is parsed as `size_t` and then converted to @p T.
     *
     * @throws std::overflow_error if the parsed value does not fit into @p T.
     */
    template<class T>
    T parse_unsigned_integer()
    {
        // No need to assert unsignedness here. We will later check that casting to the desired
        // type worked, and we test for the correct sign there as well, so that it works for
        // signed types.
        // static_assert(
        //     std::is_unsigned<T>::value,
        //     "Need unsigned type for parse_unsigned_integer()"
        // );

        auto const x = parse_unsigned_integer_size_t_();

        // We parsed as largest, and now try to cast to desired type,
        // testing that back-conversion gives the same value and correct sign.
        auto const r = static_cast<T>(x);
        if( static_cast<size_t>(r) != x || r < 0 ) {
            throw std::overflow_error(
                "Numerical overflow in " + source_name() + " at " + at() + "."
            );
        }
        return r;
    }

    /**
     * @brief Parse a signed integer at the current position and return it as type @p T.
     *
     * An optional leading `+` or `-` is consumed, and the following digits are parsed as
     * `size_t`. The full range of @p T is supported, including its most negative value.
     *
     * @throws std::runtime_error  if the stream is at its end.
     * @throws std::overflow_error if the parsed value does not fit into @p T.
     */
    template<class T>
    T parse_signed_integer()
    {
        static_assert(
            std::is_signed<T>::value,
            "Need signed type for parse_signed_integer()"
        );

        if( data_pos_ >= data_end_ ) {
            throw std::runtime_error(
                "Expecting number in " + source_name() + " at " + at() + "."
            );
        }

        bool negative = false;
        if( current_ == '-' || current_ == '+' ) {
            negative = ( current_ == '-' );

            // Here we know that we are within limits of the buffer, and not at a new line,
            // so we can skip the bookkeeping of operator++(). We still use set_current_char_()
            // to fetch the next char instead of reading the buffer directly, so that a sign
            // at the very end of the data does not read beyond data_end_.
            assert( data_pos_ < data_end_ );
            assert( current_ != '\n' );
            ++column_;
            ++data_pos_;
            set_current_char_();
        }

        // Parse the magnitude as largest type, and then try to cast to the desired type.
        auto const x = parse_unsigned_integer_size_t_();
        auto const magnitude = static_cast<T>( x );

        // If the magnitude survives the round trip as a non-negative value, it is within
        // [ 0, max ], and hence can also safely be negated.
        if( magnitude >= 0 && static_cast<size_t>( magnitude ) == x ) {
            return negative ? static_cast<T>( -magnitude ) : magnitude;
        }

        // The negative range of integers is one larger than the positive one. The only other
        // valid input hence is the most negative value, with a magnitude of exactly max + 1.
        // This is tested on the unsigned magnitude, as negating that value in T would overflow.
        if(
            negative && std::is_integral<T>::value &&
            static_cast<std::uintmax_t>( x ) ==
            static_cast<std::uintmax_t>( std::numeric_limits<T>::max() ) + 1
        ) {
            return std::numeric_limits<T>::min();
        }

        throw std::overflow_error(
            "Numerical overflow in " + source_name() + " at " + at() + "."
        );
    }

    /**
     * @brief Alias for parse_signed_integer().
     */
    template<class T>
    T parse_integer()
    {
        return parse_signed_integer<T>();
    }

    // -------------------------------------------------------------------------
    //     State
    // -------------------------------------------------------------------------

    /**
     * @brief Return the current line counter.
     */
    size_t line() const
    {
        return line_;
    }

    /**
     * @brief Return the current column counter.
     */
    size_t column() const
    {
        return column_;
    }

    /**
     * @brief Return the current position as a string of the form `line:column`.
     */
    std::string at() const
    {
        return std::to_string( line_ ) + ":" + std::to_string( column_ );
    }

    /**
     * @brief Return `true` iff there is still data to be read.
     */
    inline bool good() const
    {
        return data_pos_ < data_end_;
    }

    /**
     * @brief Same as good().
     */
    inline explicit operator bool() const
    {
        return data_pos_ < data_end_;
    }

    /**
     * @brief Return `true` iff the end of the data was reached. Opposite of good().
     */
    inline bool eof() const
    {
        return data_pos_ >= data_end_;
    }

    /**
     * @brief Return the name of the input source, as used in error messages.
     */
    std::string source_name() const
    {
        return source_name_;
    }

    /**
     * @brief Return a pointer to the current position in the buffer, and the number of bytes
     * from there to the end of the currently buffered data.
     */
    std::pair<char const*, size_t> buffer()
    {
        assert( data_pos_ <= data_end_ );
        return { buffer_ + data_pos_, data_end_ - data_pos_ };
    }

    /**
     * @brief Jump @p n chars forward within the currently buffered data.
     *
     * The column is increased by @p n, but the line counter is left as is, and the skipped chars
     * are not inspected. The target position has to be before the end of the buffered data.
     *
     * @throws std::runtime_error if the jump would reach or exceed the end of the buffered data.
     */
    void jump_unchecked( size_t n )
    {
        // Safety first! Compare against the remaining size instead of computing data_pos_ + n,
        // so that large values of n cannot wrap around and pass the test.
        if( data_pos_ >= data_end_ || n >= data_end_ - data_pos_ ) {
            throw std::runtime_error(
                "Invalid InputStream jump to position after buffer end."
            );
        }

        // Update the position as needed.
        data_pos_ += n;
        column_ += n;
        if( data_pos_ >= BlockLength ) {
            update_blocks_();
        }
        set_current_char_();
    }

    // -------------------------------------------------------------------------
    //     Internal Members
    // -------------------------------------------------------------------------

private:

    // Set up the stream for the given input source. Defined out of line.
    void init_( std::shared_ptr<BaseInputSource> input_source );

    // Mark the stream as used up: zero line and column, and `\0` as current char.
    void reset_()
    {
        line_    = 0;
        column_  = 0;
        current_ = '\0';
    }

    // Called once data_pos_ has reached BlockLength. Defined out of line.
    void update_blocks_();

    // Set current_ from the buffer at data_pos_, taking care of the end of the data
    // and of normalizing line breaks.
    inline void set_current_char_()
    {
        // Check end of stream conditions.
        if( data_pos_ >= data_end_ ) GENESIS_UNLIKELY {
            // We do not expect to overshoot. Let's assert this, but if it still happens
            // (in release build), we can also cope, and will just set \0 as the current char.
            assert( data_pos_ == data_end_ );

            if( data_pos_ == data_end_ && data_pos_ > 0 && buffer_[ data_pos_ - 1 ] != '\n' ) {
                // If this is the end of the data, but there was no closing \n, add one.
                buffer_[ data_pos_ ] = '\n';
                ++data_end_;
            } else {
                // If we reached the end, do not fully reset the line and column counters.
                // They might be needed in some parser.
                current_ = '\0';
                return;
            }
        }

        // Treat stupid Windows and Mac lines breaks. Set them to \n, so that downstream parsers
        // don't have to deal with this.
        if( buffer_[ data_pos_ ] == '\r' ) {
            buffer_[ data_pos_ ] = '\n';

            // If this is a Win line break \r\n, skip one of them, so that only a single \n
            // is visible to the outside. We do not treat \n\r line breaks properly here!
            // If any system still uses those, we'd have to change code here.
            if( data_pos_ + 1 < data_end_ && buffer_[ data_pos_ + 1 ] == '\n' ) {
                ++data_pos_;
            }
        }

        // Set the char.
        current_ = buffer_[ data_pos_ ];
    }

    // -------------------------------------------------------------------------
    //     Data Members
    // -------------------------------------------------------------------------

private:

    // Input data comes from here...
    // (we use a unique ptr to make the class movable)
    std::unique_ptr<InputReader> input_reader_ = nullptr;
    std::string source_name_;

    // ...and is buffered here. The buffer is owned by this class and freed in the destructor.
    // Valid data is in the range [ data_pos_, data_end_ ).
    char*  buffer_;
    size_t data_pos_;
    size_t data_end_;

    // Also, keep track of the current char and position in the data.
    char   current_;
    size_t line_;
    size_t column_;
};


// =================================================================================================

//     Template Specializations

// =================================================================================================


/**
 * @brief Specialization of parse_unsigned_integer() for `size_t`.
 *
 * The internal parser already yields a `size_t`, so its result is handed back as is, without
 * the cast-and-compare overflow check that the generic template performs.
 */
template<>
inline size_t InputStream::parse_unsigned_integer<size_t>()
{
    size_t const parsed = parse_unsigned_integer_size_t_();
    return parsed;
}


} // namespace utils

} // namespace genesis


#endif // include guard