// statistics_8hpp_source.html

#ifndef GENESIS_UTILS_MATH_STATISTICS_H_

#define GENESIS_UTILS_MATH_STATISTICS_H_


/*

    Genesis - A toolkit for working with phylogenetic data.

    Copyright (C) 2014-2024 Lucas Czech


    This program is free software: you can redistribute it and/or modify

    it under the terms of the GNU General Public License as published by

    the Free Software Foundation, either version 3 of the License, or

    (at your option) any later version.


    This program is distributed in the hope that it will be useful,

    but WITHOUT ANY WARRANTY; without even the implied warranty of

    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

    GNU General Public License for more details.


    You should have received a copy of the GNU General Public License

    along with this program.  If not, see <http://www.gnu.org/licenses/>.


    Contact:

    Lucas Czech <lczech@carnegiescience.edu>

    Department of Plant Biology, Carnegie Institution For Science

    260 Panama Street, Stanford, CA 94305, USA

*/


#include "genesis/utils/core/algorithm.hpp"

#include "genesis/utils/math/common.hpp"

#include "genesis/utils/math/ranking.hpp"


#include <algorithm>

#include <cassert>

#include <cmath>

#include <cstddef>

#include <functional>

#include <limits>

#include <stdexcept>

#include <utility>

#include <vector>


namespace genesis {

namespace utils {


// =================================================================================================

//     Static Assertions

// =================================================================================================


// We need to make sure that doubles and their infinities behave the way we expect!


// Compile-time checks of the properties of `double` that this file expects:
// IEC 559/IEEE 754 conformance, and the availability of infinity and quiet NaN values.
static_assert(
    std::numeric_limits<double>::is_iec559,
    "IEC 559/IEEE 754 floating-point types required (wrong double type)."
);
static_assert(
    std::numeric_limits<double>::has_infinity,
    "IEC 559/IEEE 754 floating-point types required (does not have infinity)."
);
static_assert(
    std::numeric_limits<double>::has_quiet_NaN,
    "IEC 559/IEEE 754 floating-point types required (does not have quiet NaN)."
);

// Negative infinity has to compare smaller than the lowest finite double.
static_assert(
    - std::numeric_limits<double>::infinity() < std::numeric_limits<double>::lowest(),
    "IEC 559/IEEE 754 floating-point types required (infinity is not the lowest value)."
);


// Clang fails to compile the following assertions, because of missing const expr markers

// in their std lib implementation. We hence need to skip those tests for clang :-(

// Hopefully, the above assertions are enough to cover the basics.


// Additional checks that call std::isinf() and multiply infinity within a constant expression.
// They are only compiled if __clang__ is not defined.
#ifndef __clang__


// Negated infinity has to be recognized as infinite.
static_assert(

    std::isinf( - std::numeric_limits<double>::infinity() ),

    "IEC 559/IEEE 754 floating-point types required (infinity is not working properly)."

);

// Same check, but obtaining negative infinity via multiplication with -1.
static_assert(

    std::isinf( -1 * std::numeric_limits<double>::infinity()),

    "IEC 559/IEEE 754 floating-point types required."

);

// Negative infinity obtained via multiplication has to be smaller than the lowest finite double.
static_assert(

    -1 * std::numeric_limits<double>::infinity() < std::numeric_limits<double>::lowest(),

    "IEC 559/IEEE 754 floating-point types required."

);


#endif // __clang__


// =================================================================================================

//     Structures and Classes

// =================================================================================================


/**
 * @brief Plain pair of a minimum and a maximum value of type @p T.
 *
 * The members have no default initializers; a default-initialized instance
 * hence has indeterminate values for fundamental types.
 */
template< typename T >

struct MinMaxPair

{

    /// The minimum value.
    T min;

    /// The maximum value.
    T max;

};


/**
 * @brief Plain pair of a mean and a standard deviation, as returned by mean_stddev().
 *
 * The members have no default initializers, so they need to be set explicitly.
 */
struct MeanStddevPair

{

    /// The mean value.
    double mean;

    /// The standard deviation.
    double stddev;

};


/**
 * @brief Five values describing a distribution, as filled in by quartiles().
 *
 * All members default to 0.0.
 */
struct Quartiles

{

    /// Minimum (first element of the sorted range).
    double q0 = 0.0;

    /// Lower quartile (median of the lower half).
    double q1 = 0.0;

    /// Median of the whole range.
    double q2 = 0.0;

    /// Upper quartile (median of the upper half).
    double q3 = 0.0;

    /// Maximum (last element of the sorted range).
    double q4 = 0.0;

};


// =================================================================================================

//     Standard Helper Functions

// =================================================================================================


/**
 * @brief Count how many elements of a range are finite.
 *
 * @param first Begin of the range.
 * @param last  End (past-the-end) of the range.
 * @return Pair whose `first` is the number of elements for which std::isfinite() is true,
 *         and whose `second` is the total number of elements in the range.
 */
template <class ForwardIterator>
std::pair<size_t, size_t> count_finite_elements( ForwardIterator first, ForwardIterator last )
{
    size_t num_finite = 0;
    size_t num_total  = 0;

    // Single pass: every element counts towards the total, only finite ones towards the valid.
    for( auto cur = first; cur != last; ++cur ) {
        ++num_total;
        if( std::isfinite( *cur )) {
            ++num_finite;
        }
    }

    return std::pair<size_t, size_t>( num_finite, num_total );
}


/**
 * @brief Find the smallest finite value in a range.
 *
 * Elements for which std::isfinite() is false (infinities, NaN) are skipped.
 *
 * @param first Begin of the range.
 * @param last  End (past-the-end) of the range.
 * @return The smallest finite element, or quiet NaN if the range contains no finite element.
 */
template <class ForwardIterator>
double finite_minimum( ForwardIterator first, ForwardIterator last )
{
    // Start at the largest finite double, so that any finite element is at most this value.
    double smallest   = std::numeric_limits<double>::max();
    bool   any_finite = false;

    for( ; first != last; ++first ) {
        if( ! std::isfinite( *first )) {
            continue;
        }
        any_finite = true;
        if( *first < smallest ) {
            smallest = *first;
        }
    }

    // Without any finite element, there is no meaningful minimum.
    return any_finite ? smallest : std::numeric_limits<double>::quiet_NaN();
}


/**
 * @brief Find the largest finite value in a range.
 *
 * Elements for which std::isfinite() is false (infinities, NaN) are skipped.
 *
 * @param first Begin of the range.
 * @param last  End (past-the-end) of the range.
 * @return The largest finite element, or quiet NaN if the range contains no finite element.
 */
template <class ForwardIterator>
double finite_maximum( ForwardIterator first, ForwardIterator last )
{
    // Start at the lowest finite double, so that any finite element is at least this value.
    double largest    = std::numeric_limits<double>::lowest();
    bool   any_finite = false;

    for( ; first != last; ++first ) {
        if( ! std::isfinite( *first )) {
            continue;
        }
        any_finite = true;
        if( *first > largest ) {
            largest = *first;
        }
    }

    // Without any finite element, there is no meaningful maximum.
    return any_finite ? largest : std::numeric_limits<double>::quiet_NaN();
}


/**
 * @brief Find the smallest and the largest finite value of a range in a single pass.
 *
 * Elements for which std::isfinite() is false (infinities, NaN) are skipped.
 *
 * @param first Begin of the range.
 * @param last  End (past-the-end) of the range.
 * @return MinMaxPair with the smallest and largest finite element. If the range contains
 *         no finite element, both members are quiet NaN.
 */
template <class ForwardIterator>
MinMaxPair<double> finite_minimum_maximum( ForwardIterator first, ForwardIterator last )
{
    // Start with the extreme finite doubles, so that any finite element updates both bounds.
    double smallest   = std::numeric_limits<double>::max();
    double largest    = std::numeric_limits<double>::lowest();
    bool   any_finite = false;

    for( ; first != last; ++first ) {
        if( ! std::isfinite( *first )) {
            continue;
        }
        any_finite = true;
        if( *first < smallest ) {
            smallest = *first;
        }
        if( *first > largest ) {
            largest = *first;
        }
    }

    // Without any finite element, there is no meaningful minimum or maximum.
    MinMaxPair<double> result;
    if( any_finite ) {
        result.min = smallest;
        result.max = largest;
    } else {
        result.min = std::numeric_limits<double>::quiet_NaN();
        result.max = std::numeric_limits<double>::quiet_NaN();
    }
    return result;
}


// =================================================================================================

//     Normalization and Compositional Data Analysis

// =================================================================================================


/**
 * @brief Divide all finite elements of a range by their sum, in place.
 *
 * Non-finite elements are neither part of the sum nor modified. If the range contains
 * no finite element, nothing is changed.
 *
 * Note that a sum of zero is not treated specially: the finite elements are then divided
 * by 0.0.
 *
 * @param first Begin of the range; its elements are modified.
 * @param last  End (past-the-end) of the range.
 * @throws std::invalid_argument if any finite element is negative. This is detected while
 *         summing, that is, before any element is modified.
 */
template <class ForwardIterator>
void closure( ForwardIterator first, ForwardIterator last )
{
    // First pass: validate and sum up the finite elements.
    double total      = 0.0;
    bool   any_finite = false;
    for( auto cur = first; cur != last; ++cur ) {
        if( ! std::isfinite( *cur )) {
            continue;
        }
        if( *cur < 0.0 ) {
            throw std::invalid_argument(
                "Cannot calculate closure of negative numbers."
            );
        }
        total += *cur;
        any_finite = true;
    }

    // Nothing to normalize.
    if( ! any_finite ) {
        return;
    }

    // Second pass: scale the finite elements by the sum.
    for( auto cur = first; cur != last; ++cur ) {
        if( std::isfinite( *cur )) {
            *cur /= total;
        }
    }
}


/**
 * @brief Convenience overload that applies the iterator-based closure() to a whole vector,
 * modifying it in place.
 */
inline void closure( std::vector<double>& vec )
{
    closure( vec.begin(), vec.end() );
}


// =================================================================================================

//     Mean Stddev

// =================================================================================================


/**
 * @brief Compute the arithmetic mean and the standard deviation of the finite elements
 * of a range.
 *
 * Non-finite elements are skipped. The standard deviation divides the sum of squared
 * deviations by the number of finite elements (not by that number minus one).
 *
 * @param first   Begin of the range.
 * @param last    End (past-the-end) of the range.
 * @param epsilon If the computed standard deviation is less than or equal to this value,
 *                it is replaced by 1.0. The default of -1.0 disables this replacement.
 * @return Mean and standard deviation; both are 0.0 if the range has no finite element.
 */
template <class ForwardIterator>
MeanStddevPair mean_stddev( ForwardIterator first, ForwardIterator last, double epsilon = -1.0 )
{
    MeanStddevPair result;
    result.mean   = 0.0;
    result.stddev = 0.0;

    // First pass: sum and count of the finite elements.
    double total      = 0.0;
    size_t num_finite = 0;
    for( auto cur = first; cur != last; ++cur ) {
        if( std::isfinite( *cur )) {
            total += *cur;
            ++num_finite;
        }
    }

    // Without finite elements, the all-zero result is returned.
    if( num_finite == 0 ) {
        return result;
    }
    result.mean = total / static_cast<double>( num_finite );

    // Second pass: sum of squared deviations from the mean.
    double squares = 0.0;
    for( auto cur = first; cur != last; ++cur ) {
        if( std::isfinite( *cur )) {
            auto const deviation = *cur - result.mean;
            squares += ( deviation * deviation );
        }
    }
    assert( num_finite > 0 );
    result.stddev = std::sqrt( squares / static_cast<double>( num_finite ));

    // Inelegant (but usual) handling of near-zero values, which would otherwise cause
    // a division by zero in later computations that divide by the standard deviation.
    assert( result.stddev >= 0.0 );
    if( result.stddev <= epsilon ) {
        result.stddev = 1.0;
    }

    return result;
}


/**
 * @brief Convenience overload that applies the iterator-based mean_stddev() to a whole vector.
 */
inline MeanStddevPair mean_stddev( std::vector<double> const& vec, double epsilon = -1.0 )
{
    return mean_stddev( vec.cbegin(), vec.cend(), epsilon );
}


// =================================================================================================

//     Arithmetic Mean

// =================================================================================================


/**
 * @brief Compute the arithmetic mean of the finite elements of a range.
 *
 * Non-finite elements are skipped.
 *
 * @param first Begin of the range.
 * @param last  End (past-the-end) of the range.
 * @return Sum of the finite elements divided by their number,
 *         or 0.0 if the range contains no finite element.
 */
template <class ForwardIterator>
double arithmetic_mean( ForwardIterator first, ForwardIterator last )
{
    double total      = 0.0;
    size_t num_finite = 0;

    for( auto cur = first; cur != last; ++cur ) {
        if( std::isfinite( *cur )) {
            total += *cur;
            ++num_finite;
        }
    }

    // Without finite elements, nothing was added, and the result is zero.
    if( num_finite == 0 ) {
        return 0.0;
    }
    return total / static_cast<double>( num_finite );
}


/**
 * @brief Convenience overload that applies the iterator-based arithmetic_mean()
 * to a whole vector.
 */
inline double arithmetic_mean( std::vector<double> const& vec )
{
    return arithmetic_mean( vec.cbegin(), vec.cend() );
}


/**
 * @brief Compute the weighted arithmetic mean of a range of values and a range of weights.
 *
 * The pairs of value and weight are visited via for_each_finite_pair(), which is declared
 * elsewhere; presumably it only visits pairs where both entries are finite - see there.
 *
 * @param first_value  Begin of the values.
 * @param last_value   End (past-the-end) of the values.
 * @param first_weight Begin of the weights.
 * @param last_weight  End (past-the-end) of the weights.
 * @return Sum of weight times value, divided by the sum of weights, over all visited pairs;
 *         or 0.0 if no pair was visited.
 * @throws std::invalid_argument if a visited weight is negative, or if at least one pair
 *         was visited but the weights sum up to 0.
 */
template <class ForwardIterator>
double weighted_arithmetic_mean(
    ForwardIterator first_value,  ForwardIterator last_value,
    ForwardIterator first_weight, ForwardIterator last_weight
) {
    double weighted_sum = 0.0;
    double weight_sum   = 0.0;
    size_t num_pairs    = 0;

    // Accumulate numerator and denominator for each visited pair.
    auto accumulate = [&]( double value, double weight ){
        if( weight < 0.0 ) {
            throw std::invalid_argument(
                "Cannot calculate weighted arithmetic mean with negative weights."
            );
        }
        weighted_sum += weight * value;
        weight_sum   += weight;
        ++num_pairs;
    };
    for_each_finite_pair(
        first_value, last_value,
        first_weight, last_weight,
        accumulate
    );

    // Without any visited pair, the result is zero.
    if( num_pairs == 0 ) {
        return 0.0;
    }
    if( weight_sum == 0.0 ) {
        throw std::invalid_argument(
            "Cannot calculate weighted arithmetic mean with all weights being 0."
        );
    }

    assert( weight_sum > 0.0 );
    return ( weighted_sum / weight_sum );
}


/**
 * @brief Convenience overload that applies the iterator-based weighted_arithmetic_mean()
 * to a vector of values and a vector of weights.
 */
inline double weighted_arithmetic_mean(
    std::vector<double> const& values,
    std::vector<double> const& weights
) {
    return weighted_arithmetic_mean(
        values.cbegin(),  values.cend(),
        weights.cbegin(), weights.cend()
    );
}


// =================================================================================================

//     Geometric Mean

// =================================================================================================


/**
 * @brief Compute the geometric mean of the finite elements of a range.
 *
 * Non-finite elements are skipped. The result is computed as the exponential of the mean
 * of the logarithms, instead of as the root of a product, to avoid overflows.
 *
 * @param first Begin of the range.
 * @param last  End (past-the-end) of the range.
 * @return The geometric mean of the finite elements,
 *         or 0.0 if the range contains no finite element.
 * @throws std::invalid_argument if any finite element is zero or negative.
 */
template <class ForwardIterator>
double geometric_mean( ForwardIterator first, ForwardIterator last )
{
    double log_sum    = 0.0;
    size_t num_finite = 0;

    for( auto cur = first; cur != last; ++cur ) {
        if( ! std::isfinite( *cur )) {
            continue;
        }
        if( *cur <= 0.0 ) {
            throw std::invalid_argument(
                "Cannot calculate geometric mean of non-positive numbers."
            );
        }
        log_sum += std::log( *cur );
        ++num_finite;
    }

    // Without finite elements, the result is zero.
    if( num_finite == 0 ) {
        return 0.0;
    }

    assert( std::isfinite( log_sum ));
    return std::exp( log_sum / static_cast<double>( num_finite ));
}


/**
 * @brief Convenience overload that applies the iterator-based geometric_mean()
 * to a whole vector.
 */
inline double geometric_mean( std::vector<double> const& vec )
{
    return geometric_mean( vec.cbegin(), vec.cend() );
}


/**
 * @brief Compute the weighted geometric mean of a range of values and a range of weights.
 *
 * The pairs of value and weight are visited via for_each_finite_pair(), which is declared
 * elsewhere; presumably it only visits pairs where both entries are finite - see there.
 * The result is the exponential of the weighted mean of the logarithms of the values.
 *
 * @param first_value  Begin of the values.
 * @param last_value   End (past-the-end) of the values.
 * @param first_weight Begin of the weights.
 * @param last_weight  End (past-the-end) of the weights.
 * @return The weighted geometric mean over all visited pairs, or 0.0 if no pair was visited.
 * @throws std::invalid_argument if a visited value is zero or negative, if a visited weight
 *         is negative, or if at least one pair was visited but the weights sum up to 0.
 */
template <class ForwardIterator>
double weighted_geometric_mean(
    ForwardIterator first_value,  ForwardIterator last_value,
    ForwardIterator first_weight, ForwardIterator last_weight
) {
    double weighted_log_sum = 0.0;
    double weight_sum       = 0.0;
    size_t num_pairs        = 0;

    // Accumulate numerator and denominator for each visited pair.
    auto accumulate = [&]( double value, double weight ){
        if( value <= 0.0 ) {
            throw std::invalid_argument(
                "Cannot calculate weighted geometric mean of non-positive values."
            );
        }
        if( weight < 0.0 ) {
            throw std::invalid_argument(
                "Cannot calculate weighted geometric mean with negative weights."
            );
        }
        weighted_log_sum += weight * std::log( value );
        weight_sum       += weight;
        ++num_pairs;
    };
    for_each_finite_pair(
        first_value, last_value,
        first_weight, last_weight,
        accumulate
    );

    // Without any visited pair, the result is zero.
    if( num_pairs == 0 ) {
        return 0.0;
    }
    if( weight_sum == 0.0 ) {
        throw std::invalid_argument(
            "Cannot calculate weighted geometric mean with all weights being 0."
        );
    }

    assert( std::isfinite( weighted_log_sum ));
    assert( std::isfinite( weight_sum ) && ( weight_sum > 0.0 ));
    return std::exp( weighted_log_sum / weight_sum );
}


/**
 * @brief Convenience overload that applies the iterator-based weighted_geometric_mean()
 * to a vector of values and a vector of weights.
 */
inline double weighted_geometric_mean(
    std::vector<double> const& values,
    std::vector<double> const& weights
) {
    return weighted_geometric_mean(
        values.cbegin(),  values.cend(),
        weights.cbegin(), weights.cend()
    );
}


// =================================================================================================

//     Harmonic Mean

// =================================================================================================


/**
 * @brief Select how harmonic_mean() and weighted_harmonic_mean() treat values that are zero.
 */
enum class HarmonicMeanZeroPolicy

{

    /// Throw a std::invalid_argument when a zero value is found.
    kThrow,


    /// Skip zero values; they contribute neither to the sums nor to the counts.
    kIgnore,


    /// Return 0.0 as the result if a zero value is found.
    kReturnZero,


    /// Count zero values (by number, or by weight in the weighted case) without adding
    /// them to the sum of inverses, and scale the result by the non-zero fraction.
    kCorrection

};


/**
 * @brief Compute the harmonic mean of the finite elements of a range.
 *
 * Non-finite elements are skipped. The mean is the number of positive elements divided by
 * the sum of their inverses. Elements that are zero are treated according to @p zero_policy:
 *
 *  - kThrow:      throw an exception.
 *  - kIgnore:     skip the element.
 *  - kReturnZero: immediately return 0.0, without looking at the remaining elements.
 *  - kCorrection: count the element, but add nothing to the sum of inverses. The final
 *                 result is then multiplied by the fraction of non-zero elements.
 *
 * @param first       Begin of the range.
 * @param last        End (past-the-end) of the range.
 * @param zero_policy How to treat elements that are zero.
 * @return The harmonic mean, or 0.0 if no element was used or all used elements are zero.
 * @throws std::invalid_argument if a finite element is negative, or if an element is zero
 *         and the policy is kThrow.
 */
template <class ForwardIterator>
double harmonic_mean(
    ForwardIterator first, ForwardIterator last,
    HarmonicMeanZeroPolicy zero_policy = HarmonicMeanZeroPolicy::kThrow
) {
    // Sum of the inverses of the positive elements, number of elements used in total
    // (which excludes non-finite ones), and number of zero elements, which is only
    // incremented with the kCorrection policy.
    double inverse_sum = 0.0;
    size_t num_used    = 0;
    size_t num_zeroes  = 0;

    for( auto cur = first; cur != last; ++cur ) {
        if( ! std::isfinite( *cur )) {
            continue;
        }
        if( *cur < 0.0 ) {
            throw std::invalid_argument(
                "Cannot calculate harmonic mean of negative values."
            );
        }

        // Positive elements are the regular case.
        if( *cur > 0.0 ) {
            inverse_sum += 1.0 / static_cast<double>( *cur );
            ++num_used;
            continue;
        }

        // What remains is a zero element, which is handled according to the policy.
        assert( *cur == 0.0 );
        if( zero_policy == HarmonicMeanZeroPolicy::kThrow ) {
            throw std::invalid_argument(
                "Zero value found when calculating harmonic mean."
            );
        }
        if( zero_policy == HarmonicMeanZeroPolicy::kReturnZero ) {
            return 0.0;
        }
        if( zero_policy == HarmonicMeanZeroPolicy::kCorrection ) {
            ++num_used;
            ++num_zeroes;
        }
        // For kIgnore, there is nothing to do.
    }

    // Without any used elements, or if all of them are zero, the result is zero.
    if( num_used == 0 || num_used == num_zeroes ) {
        return 0.0;
    }
    assert( num_used > num_zeroes );
    assert( std::isfinite( inverse_sum ));

    // The correction factor is always applied. Without the kCorrection policy,
    // no zeroes are counted, so that the factor is 1 and does not alter the result.
    auto const num_positive = static_cast<double>( num_used - num_zeroes );
    auto const correction   = num_positive / static_cast<double>( num_used );
    return correction * num_positive / inverse_sum;
}


/**
 * @brief Convenience overload that applies the iterator-based harmonic_mean()
 * to a whole vector.
 */
inline double harmonic_mean(
    std::vector<double> const& vec,
    HarmonicMeanZeroPolicy zero_policy = HarmonicMeanZeroPolicy::kThrow
) {
    return harmonic_mean( vec.cbegin(), vec.cend(), zero_policy );
}


/**
 * @brief Compute the weighted harmonic mean of a range of values and a range of weights.
 *
 * The pairs of value and weight are visited via for_each_finite_pair(), which is declared
 * elsewhere; presumably it only visits pairs where both entries are finite - see there.
 * The mean is the sum of the weights of the positive values, divided by the sum of
 * weight / value of these. Values that are zero are treated according to @p zero_policy:
 *
 *  - kThrow:      throw an exception.
 *  - kIgnore:     skip the pair.
 *  - kReturnZero: return 0.0 once all pairs have been visited and checked.
 *  - kCorrection: add the weight to the total weight, but not to the two sums. The final
 *                 result is then multiplied by the weight fraction of the positive values.
 *
 * @param first_value  Begin of the values.
 * @param last_value   End (past-the-end) of the values.
 * @param first_weight Begin of the weights.
 * @param last_weight  End (past-the-end) of the weights.
 * @param zero_policy  How to treat values that are zero.
 * @return The weighted harmonic mean, or 0.0 if no pair was used or all used values are zero.
 * @throws std::invalid_argument if a visited value or weight is negative, if a value is zero
 *         and the policy is kThrow, or if the sums needed for the division are 0.
 */
template <class ForwardIterator>
double weighted_harmonic_mean(
    ForwardIterator first_value,  ForwardIterator last_value,
    ForwardIterator first_weight, ForwardIterator last_weight,
    HarmonicMeanZeroPolicy zero_policy = HarmonicMeanZeroPolicy::kThrow
) {
    // Sum of all weights (including those of counted zero values), sum of the weights of
    // positive values (numerator), sum of weight / value of positive values (denominator),
    // number of pairs used in total, and number of counted zero values.
    double weight_total    = 0.0;
    double weight_positive = 0.0;
    double inverse_sum     = 0.0;
    size_t num_used        = 0;
    size_t num_zeroes      = 0;

    auto accumulate = [&]( double value, double weight ){
        if( value < 0.0 ) {
            throw std::invalid_argument(
                "Cannot calculate weighted harmonic mean of negative values."
            );
        }
        if( weight < 0.0 ) {
            throw std::invalid_argument(
                "Cannot calculate weighted harmonic mean with negative weights."
            );
        }

        // Positive values are the regular case.
        if( value > 0.0 ) {
            weight_total    += weight;
            weight_positive += weight;
            inverse_sum     += weight / static_cast<double>( value );
            ++num_used;
            return;
        }

        // What remains is a zero value, which is handled according to the policy.
        assert( value == 0.0 );
        if( zero_policy == HarmonicMeanZeroPolicy::kThrow ) {
            throw std::invalid_argument(
                "Zero value found when calculating weighted harmonic mean."
            );
        }

        // For both the return zero and the correction policy, the zero value contributes
        // its weight to the total and is counted, but adds nothing to the two sums.
        // With the return zero policy, the zero counter merely serves as a flag.
        // With the ignore policy, nothing is done.
        bool const counts_zero = (
            zero_policy == HarmonicMeanZeroPolicy::kReturnZero ||
            zero_policy == HarmonicMeanZeroPolicy::kCorrection
        );
        if( counts_zero ) {
            weight_total += weight;
            ++num_used;
            ++num_zeroes;
        }
    };
    for_each_finite_pair(
        first_value, last_value,
        first_weight, last_weight,
        accumulate
    );

    // Without any used pairs, or if all used values are zero, the result is zero.
    if( num_used == 0 || num_used == num_zeroes ) {
        return 0.0;
    }

    // With the return zero policy, a single zero value suffices for a zero result.
    if( zero_policy == HarmonicMeanZeroPolicy::kReturnZero && num_zeroes > 0 ) {
        return 0.0;
    }
    if( weight_positive == 0.0 || inverse_sum == 0.0 ) {
        throw std::invalid_argument(
            "Cannot calculate weighted harmonic mean with all weights being 0."
        );
    }

    // Without counted zero values, both weight sums have received the same additions.
    assert( num_zeroes > 0 || weight_total == weight_positive );
    assert( num_used > num_zeroes );
    assert( weight_total >= weight_positive );
    assert( std::isfinite( weight_positive ) && ( weight_positive > 0.0 ));
    assert( std::isfinite( inverse_sum ) && ( inverse_sum > 0.0 ));
    assert( std::isfinite( weight_total ) && ( weight_total > 0.0 ));

    // The correction factor is always applied. Without counted zero values,
    // it is 1 and does not alter the result.
    auto const correction = weight_positive / weight_total;
    return correction * weight_positive / inverse_sum;
}


/**
 * @brief Convenience overload that applies the iterator-based weighted_harmonic_mean()
 * to a vector of values and a vector of weights.
 */
inline double weighted_harmonic_mean(
    std::vector<double> const& values,
    std::vector<double> const& weights,
    HarmonicMeanZeroPolicy zero_policy = HarmonicMeanZeroPolicy::kThrow
) {
    return weighted_harmonic_mean(
        values.cbegin(),  values.cend(),
        weights.cbegin(), weights.cend(),
        zero_policy
    );
}


// =================================================================================================

//     Median

// =================================================================================================


/**
 * @brief Compute the median of a sorted range.
 *
 * For an odd number of elements, this is the middle element; for an even number,
 * it is the average of the two middle elements.
 *
 * @param first Begin of the sorted range.
 * @param last  End (past-the-end) of the sorted range.
 * @return The median, or 0.0 if the range is empty.
 * @throws std::runtime_error if the range is not sorted according to std::is_sorted().
 */
template <class RandomAccessIterator>
double median( RandomAccessIterator first, RandomAccessIterator last )
{
    if( ! std::is_sorted( first, last )) {
        throw std::runtime_error( "Range has to be sorted for median calculation." );
    }

    auto const count = static_cast<size_t>( std::distance( first, last ));
    if( count == 0 ) {
        return 0.0;
    }

    // Integer division rounds down, which yields the middle position for odd counts,
    // and the upper one of the two middle positions for even counts.
    size_t const upper = count / 2;
    assert( upper < count );
    if( count % 2 != 0 ) {
        return *( first + upper );
    }

    // Even count: average the two middle elements.
    size_t const lower = upper - 1;
    assert( lower < count );
    return ( *( first + lower ) + *( first + upper )) / 2.0;
}


/**
 * @brief Convenience overload that applies the iterator-based median() to a whole vector,
 * which hence has to be sorted.
 */
inline double median( std::vector<double> const& vec )
{
    return median( vec.cbegin(), vec.cend() );
}


// =================================================================================================

//     Quartiles

// =================================================================================================


/**
 * @brief Compute the Quartiles of a sorted range.
 *
 * The minimum (`q0`) and maximum (`q4`) are the first and last element, and `q2` is the
 * median() of the whole range. The lower and upper quartile (`q1`, `q3`) are the medians
 * of the lower and the upper half of the range. For an odd number of elements, the middle
 * element itself is not part of either half.
 *
 * A range with a single element has empty halves; in that case, all five values are set
 * to that element.
 *
 * @param first Begin of the sorted range.
 * @param last  End (past-the-end) of the sorted range.
 * @return The Quartiles of the range; all values are 0.0 if the range is empty.
 * @throws std::runtime_error if the range is not sorted according to std::is_sorted().
 */
template <class RandomAccessIterator>
Quartiles quartiles( RandomAccessIterator first, RandomAccessIterator last )
{
    // Prepare result.
    Quartiles result;

    // Checks.
    if( ! std::is_sorted( first, last )) {
        throw std::runtime_error( "Range has to be sorted for quartiles calculation." );
    }
    auto const size = static_cast<size_t>( std::distance( first, last ));
    if( size == 0 ) {
        return result;
    }

    // Set min, 50% and max.
    result.q0 = *first;
    result.q2 = median( first, last );
    result.q4 = *(first + size - 1);

    // With a single element, both halves are empty, for which median() returns 0.0.
    // Using that would yield quartiles that are unrelated to the data, so instead
    // the only sensible value is used: the element itself.
    if( size == 1 ) {
        result.q1 = result.q2;
        result.q3 = result.q2;
        return result;
    }

    // Even size: split exactly in halves.
    // Odd size: skip the middle element, so that it is not part of the upper half.
    size_t const half = size / 2;
    size_t const skip = ( size % 2 == 0 ) ? 0 : 1;
    result.q1 = median( first, first + half );
    result.q3 = median( first + half + skip, first + size );

    return result;
}


/**
 * @brief Convenience overload that applies the iterator-based quartiles() to a whole vector,
 * which hence has to be sorted.
 */
inline Quartiles quartiles( std::vector<double> const& vec )
{
    return quartiles( vec.cbegin(), vec.cend() );
}


// =================================================================================================

//     Dispersion

// =================================================================================================


/**
 * @brief Compute the coefficient of variation, that is, the standard deviation divided
 * by the mean.
 *
 * A mean of zero is not treated specially; the division then follows floating-point rules.
 */
inline double coefficient_of_variation( MeanStddevPair const& ms )
{
    double const ratio = ms.stddev / ms.mean;
    return ratio;
}


inline std::vector<double> coefficient_of_variation( std::vector<MeanStddevPair> const& ms )

{

    auto res = std::vector<double>( ms.size() );

    for( size_t i = 0; i < ms.size(); ++i ) {

        res[ i ] = coefficient_of_variation( ms[i] );

    }

    return res;

}


/**
 * @brief Compute the index of dispersion, that is, the squared standard deviation
 * divided by the mean.
 *
 * A mean of zero is not treated specially; the division then follows floating-point rules.
 */
inline double index_of_dispersion( MeanStddevPair const& ms )
{
    double const variance = ms.stddev * ms.stddev;
    return variance / ms.mean;
}


inline std::vector<double> index_of_dispersion( std::vector<MeanStddevPair> const& ms )

{

    auto res = std::vector<double>( ms.size() );

    for( size_t i = 0; i < ms.size(); ++i ) {

        res[ i ] = index_of_dispersion( ms[i] );

    }

    return res;

}


/**
 * @brief Compute the quartile coefficient of dispersion, that is, the difference of the
 * upper and lower quartile divided by their sum: `( q3 - q1 ) / ( q3 + q1 )`.
 *
 * A sum of zero is not treated specially; the division then follows floating-point rules.
 */
inline double quartile_coefficient_of_dispersion( Quartiles const& q )
{
    double const difference = q.q3 - q.q1;
    double const sum        = q.q3 + q.q1;
    return difference / sum;
}


inline std::vector<double> quartile_coefficient_of_dispersion( std::vector<Quartiles> const& q )

{

    auto res = std::vector<double>( q.size() );

    for( size_t i = 0; i < q.size(); ++i ) {

        res[ i ] = quartile_coefficient_of_dispersion( q[i] );

    }

    return res;

}


} // namespace utils

} // namespace genesis


#endif // include guard