// fst__pool__functions_8hpp_source.html

#ifndef GENESIS_POPULATION_FUNCTION_FST_POOL_FUNCTIONS_H_

#define GENESIS_POPULATION_FUNCTION_FST_POOL_FUNCTIONS_H_


/*

    Genesis - A toolkit for working with phylogenetic data.

    Copyright (C) 2014-2024 Lucas Czech


    This program is free software: you can redistribute it and/or modify

    it under the terms of the GNU General Public License as published by

    the Free Software Foundation, either version 3 of the License, or

    (at your option) any later version.


    This program is distributed in the hope that it will be useful,

    but WITHOUT ANY WARRANTY; without even the implied warranty of

    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

    GNU General Public License for more details.


    You should have received a copy of the GNU General Public License

    along with this program.  If not, see <http://www.gnu.org/licenses/>.


    Contact:

    Lucas Czech <lczech@carnegiescience.edu>

    Department of Plant Biology, Carnegie Institution For Science

    260 Panama Street, Stanford, CA 94305, USA

*/


#include "genesis/population/filter/filter_stats.hpp"

#include "genesis/population/filter/filter_status.hpp"

#include "genesis/population/filter/sample_counts_filter.hpp"

#include "genesis/population/filter/variant_filter.hpp"

#include "genesis/population/function/fst_pool_karlsson.hpp"

#include "genesis/population/function/fst_pool_kofler.hpp"

#include "genesis/population/function/fst_pool_processor.hpp"

#include "genesis/population/function/fst_pool_unbiased.hpp"

#include "genesis/population/function/functions.hpp"

#include "genesis/population/function/window_average.hpp"

#include "genesis/population/variant.hpp"

#include "genesis/utils/containers/matrix.hpp"

#include "genesis/utils/containers/transform_iterator.hpp"


#include <algorithm>

#include <cassert>

#include <cmath>

#include <limits>

#include <stdexcept>

#include <string>

#include <utility>

#include <vector>


namespace genesis {

namespace population {


// =================================================================================================

//     Compute Helper

// =================================================================================================


// Can only be used with C++14, as we _need_ generic lambda expressions (with `auto` type-specifier

// in the parameter list). Otherwise, the type that is actually needed for the range iterators

// in the fst_functor() call in the function is literally impossible to capture. That is because

// the type contains a TransformIterator, which has a lambda as one of its template parameters,

// which hence cannot be named or captured in any form other than a lambda or an auto context...

// See https://stackoverflow.com/a/4846597/4184258 for details. See the version history of this

// file (which is currently named `genesis/population/functions/structure.hpp`) for the previous

// instance of this function that also worked under C++11, by simply copy-pasting the contents of

// compute_pairwise_f_st() to the functions where it is used...

// That was ugly, and this function is currently not needed for our downstream tools

// (e.g., grenedalf, see https://github.com/lczech/grenedalf), so we just leave it active here

// when C++14 is used for compiling.


#if __cplusplus >= 201402L


/**
 * @brief Helper that fills a symmetric matrix with pairwise F_ST values for all pairs of samples.
 *
 * Every element of the range [ @p begin, @p end ) is accessed as a `std::vector<SampleCounts>`.
 * The number of samples is taken from the first element of the range, and determines the
 * dimensions of the resulting square matrix. An empty range yields a default-constructed matrix.
 *
 * For each pair of sample indices `i < j`, the @p fst_functor is called as
 * `fst_functor( i, j, begin_i, end_i, begin_j, end_j )`, where the iterator pairs belong to
 * transforming ranges over the input that select the SampleCounts at index `i` and `j`,
 * respectively. The returned value is stored at both `( i, j )` and `( j, i )`; the diagonal
 * keeps its initial value of `0.0`.
 *
 * @throws std::runtime_error from within the element selection, if an element of the input range
 *         does not contain the same number of SampleCounts as the first element.
 */
template<class ForwardIterator, typename FstFunctor>
utils::Matrix<double> compute_pairwise_f_st(
    ForwardIterator begin, ForwardIterator end,
    FstFunctor fst_functor
) {
    // For now, the elements of the range need to be vectors of SampleCounts. A more generic
    // container type would have to be revisited if such a use case comes up.
    using SampleVector = std::vector<SampleCounts>;

    // Nothing to iterate over means nothing to compute.
    if( begin == end ) {
        return {};
    }

    // The range is not empty, so its first element tells us how many samples there are.
    // All later elements are checked against that number when they are accessed.
    size_t const sample_count = static_cast<SampleVector const&>( *begin ).size();
    utils::Matrix<double> fst_matrix( sample_count, sample_count, 0.0 );

    // Build a transforming range over the input that picks the sample at a given index
    // from the set of SampleCounts at each position of the underlying iterator.
    auto make_sample_range = [sample_count](
        ForwardIterator first, ForwardIterator last, size_t sample_index
    ){
        return utils::make_transform_range(
            [sample_count, sample_index]( SampleVector const& entry ) -> SampleCounts const& {
                if( entry.size() != sample_count ) {
                    throw std::runtime_error(
                        "In compute_pairwise_f_st(): The number of SampleCounts in the "
                        "provided range is not consistent throughout the iteration."
                    );
                }
                return entry[sample_index];
            },
            first, last
        );
    };

    // Visit each unordered pair of samples once, hand the two corresponding ranges to the
    // functor, and mirror its value across the diagonal of the matrix.
    for( size_t row = 0; row < sample_count; ++row ) {
        for( size_t col = row + 1; col < sample_count; ++col ) {
            auto row_range = make_sample_range( begin, end, row );
            auto col_range = make_sample_range( begin, end, col );
            auto const pair_value = fst_functor(
                row, col,
                row_range.begin(), row_range.end(),
                col_range.begin(), col_range.end()
            );
            fst_matrix( row, col ) = pair_value;
            fst_matrix( col, row ) = pair_value;
        }
    }

    return fst_matrix;
}


#endif // __cplusplus >= 201402L


// =================================================================================================

//     F_ST Pool Kofler

// =================================================================================================


/**
 * @brief Compute F_ST for two ranges of SampleCounts, using a FstPoolCalculatorKofler.
 *
 * Both ranges are traversed in parallel, with each step handing the pair of current elements
 * to the calculator. The elements need to offer a `status.passing()` query, and be usable as
 * arguments to FstPoolCalculatorKofler::process().
 *
 * @param p1_poolsize          Pool size of the first sample.
 * @param p2_poolsize          Pool size of the second sample.
 * @param p1_begin,p1_end      Range of the first sample.
 * @param p2_begin,p2_end      Range of the second sample.
 * @param only_passing_samples If set (default), pairs where at least one of the two elements
 *                             has a non-passing status are skipped instead of processed.
 *
 * @return The result of the calculator, or a quiet NaN if either pool size is `<= 1`.
 *
 * @throws std::invalid_argument if the two ranges do not have the same length.
 */
template<class ForwardIterator1, class ForwardIterator2>
double f_st_pool_kofler( // get_conventional_fstcalculator
    size_t p1_poolsize, size_t p2_poolsize,
    ForwardIterator1 p1_begin, ForwardIterator1 p1_end,
    ForwardIterator2 p2_begin, ForwardIterator2 p2_end,
    bool only_passing_samples = true
) {
    // Edge and error cases
    if( p1_poolsize <= 1 || p2_poolsize <= 1 ) {
        return std::numeric_limits<double>::quiet_NaN();
        // throw std::invalid_argument( "Cannot run f_st_pool_kofler() with poolsizes <= 1" );
    }

    // Init the calculator.
    FstPoolCalculatorKofler calc{ p1_poolsize, p2_poolsize };

    // Iterate the two ranges in parallel. Each iteration is one position in the genome.
    // In each iteration, p1_it and p2_it point at SampleCounts objects containing nucleotide counts.
    auto p1_it = p1_begin;
    auto p2_it = p2_begin;
    while( p1_it != p1_end && p2_it != p2_end ) {
        // Positions where either sample is not passing are left out of the computation.
        bool const skip_position = only_passing_samples && (
            !p1_it->status.passing() || !p2_it->status.passing()
        );
        if( !skip_position ) {
            calc.process( *p1_it, *p2_it );
        }

        // Always move on to the next position, also for skipped ones. Skipping without
        // advancing the iterators would never leave the loop.
        ++p1_it;
        ++p2_it;
    }

    // If exactly one of the ranges is exhausted, they were of different lengths.
    if( p1_it != p1_end || p2_it != p2_end ) {
        throw std::invalid_argument(
            "In f_st_pool_kofler(): Provided ranges have different length."
        );
    }

    // Compute the final result.
    return calc.get_result();
}


#if __cplusplus >= 201402L


/**
 * @brief Compute the pairwise F_ST matrix for all samples of a range, via the two-sample
 * overload of f_st_pool_kofler().
 *
 * The range [ @p begin, @p end ) is forwarded to compute_pairwise_f_st(). For each pair of
 * sample indices that it requests, the pool sizes at these two indices in @p poolsizes are
 * used for the two-sample computation.
 *
 * @throws std::runtime_error if a requested sample index is not a valid index into @p poolsizes.
 */
template<class ForwardIterator>
utils::Matrix<double> f_st_pool_kofler(
    std::vector<size_t> const& poolsizes,
    ForwardIterator begin, ForwardIterator end
) {
    // Functor for a single pair of samples, looking up their pool sizes by index.
    auto pair_fst = [&poolsizes](
        size_t idx_a, size_t idx_b,
        auto a_first, auto a_last, auto b_first, auto b_last
    ){
        size_t const pool_count = poolsizes.size();
        if( !( idx_a < pool_count && idx_b < pool_count )) {
            throw std::runtime_error(
                "In f_st_pool_kofler(): Provided ranges have different lengths that "
                "are not identical to the number of poolsizes provided."
            );
        }
        return f_st_pool_kofler(
            poolsizes[idx_a], poolsizes[idx_b],
            a_first, a_last, b_first, b_last
        );
    };

    return compute_pairwise_f_st( begin, end, pair_fst );
}


#endif // __cplusplus >= 201402L


// =================================================================================================

//     F_ST Pool Karlsson

// =================================================================================================


/**
 * @brief Compute F_ST for two ranges of SampleCounts, using a FstPoolCalculatorKarlsson.
 *
 * Both ranges are traversed in parallel, with each step handing the pair of current elements
 * to the calculator. The elements need to offer a `status.passing()` query, and be usable as
 * arguments to FstPoolCalculatorKarlsson::process().
 *
 * @param p1_begin,p1_end      Range of the first sample.
 * @param p2_begin,p2_end      Range of the second sample.
 * @param only_passing_samples If set (default), pairs where at least one of the two elements
 *                             has a non-passing status are skipped instead of processed.
 *
 * @return The result of the calculator after all pairs have been visited.
 *
 * @throws std::invalid_argument if the two ranges do not have the same length.
 */
template<class ForwardIterator1, class ForwardIterator2>
double f_st_pool_karlsson( // get_asymptunbiased_fstcalculator
    ForwardIterator1 p1_begin, ForwardIterator1 p1_end,
    ForwardIterator2 p2_begin, ForwardIterator2 p2_end,
    bool only_passing_samples = true
) {
    // Init the calculator.
    FstPoolCalculatorKarlsson calc{};

    // Iterate both ranges, summing up N_k and D_k for all their entries.
    auto p1_it = p1_begin;
    auto p2_it = p2_begin;
    while( p1_it != p1_end && p2_it != p2_end ) {
        // Positions where either sample is not passing are left out of the computation.
        bool const skip_position = only_passing_samples && (
            !p1_it->status.passing() || !p2_it->status.passing()
        );
        if( !skip_position ) {
            calc.process( *p1_it, *p2_it );
        }

        // Always move on to the next position, also for skipped ones. Skipping without
        // advancing the iterators would never leave the loop.
        ++p1_it;
        ++p2_it;
    }

    // If exactly one of the ranges is exhausted, they were of different lengths.
    if( p1_it != p1_end || p2_it != p2_end ) {
        throw std::invalid_argument(
            "In f_st_pool_karlsson(): Provided ranges have different length."
        );
    }

    return calc.get_result();
}


#if __cplusplus >= 201402L


/**
 * @brief Compute the pairwise F_ST matrix for all samples of a range, via the two-sample
 * overload of f_st_pool_karlsson().
 *
 * The range [ @p begin, @p end ) is forwarded to compute_pairwise_f_st(), and each pair of
 * per-sample ranges that it provides is handed on to the two-sample computation.
 */
template<class ForwardIterator>
utils::Matrix<double> f_st_pool_karlsson(
    ForwardIterator begin, ForwardIterator end
) {
    // The sample indices are part of the functor interface expected by
    // compute_pairwise_f_st(), but this estimator has no per-sample settings to look up,
    // so they are left unnamed here.
    auto pair_fst = [](
        size_t /* first sample index */, size_t /* second sample index */,
        auto a_first, auto a_last, auto b_first, auto b_last
    ){
        return f_st_pool_karlsson( a_first, a_last, b_first, b_last );
    };

    return compute_pairwise_f_st( begin, end, pair_fst );
}


#endif // __cplusplus >= 201402L


// =================================================================================================

//     F_ST Pool Unbiased (Spence)

// =================================================================================================


/**
 * @brief Compute F_ST for two ranges of SampleCounts, using a FstPoolCalculatorUnbiased.
 *
 * Both ranges are traversed in parallel, with each step handing the pair of current elements
 * to the calculator. The elements need to offer a `status.passing()` query, and be usable as
 * arguments to FstPoolCalculatorUnbiased::process().
 *
 * @param p1_poolsize          Pool size of the first sample.
 * @param p2_poolsize          Pool size of the second sample.
 * @param p1_begin,p1_end      Range of the first sample.
 * @param p2_begin,p2_end      Range of the second sample.
 * @param only_passing_samples If set (default), pairs where at least one of the two elements
 *                             has a non-passing status are skipped instead of processed.
 *
 * @return The pair of values obtained from the calculator via `get_result_pair()`, or a pair
 *         of quiet NaNs if either pool size is `<= 1`. In this file, the first element is
 *         used by f_st_pool_unbiased_nei(), and the second by f_st_pool_unbiased_hudson().
 *
 * @throws std::invalid_argument if the two ranges do not have the same length.
 */
template<class ForwardIterator1, class ForwardIterator2>
std::pair<double, double> f_st_pool_unbiased(
    size_t p1_poolsize, size_t p2_poolsize,
    ForwardIterator1 p1_begin, ForwardIterator1 p1_end,
    ForwardIterator2 p2_begin, ForwardIterator2 p2_end,
    bool only_passing_samples = true
) {
    // Edge and error cases
    if( p1_poolsize <= 1 || p2_poolsize <= 1 ) {
        return {
            std::numeric_limits<double>::quiet_NaN(),
            std::numeric_limits<double>::quiet_NaN()
        };
        // throw std::invalid_argument( "Cannot run f_st_pool_unbiased() with poolsizes <= 1" );
    }

    // Init the calculator.
    // For simplicity in this wrapper, we only allow to normalize the pi values per window
    // via their sum; in this function, we do not have the additionally needed information
    // on the Variant Filter Status statistics anyway. If that is needed, use FstPoolProcessor.
    FstPoolCalculatorUnbiased calc{ p1_poolsize, p2_poolsize, WindowAveragePolicy::kSum };

    // Iterate the two ranges in parallel. Each iteration is one position in the genome.
    // In each iteration, p1_it and p2_it point at SampleCounts objects containing nucleotide counts.
    auto p1_it = p1_begin;
    auto p2_it = p2_begin;
    while( p1_it != p1_end && p2_it != p2_end ) {
        // Positions where either sample is not passing are left out of the computation.
        bool const skip_position = only_passing_samples && (
            !p1_it->status.passing() || !p2_it->status.passing()
        );
        if( !skip_position ) {
            calc.process( *p1_it, *p2_it );
        }

        // Always move on to the next position, also for skipped ones. Skipping without
        // advancing the iterators would never leave the loop.
        ++p1_it;
        ++p2_it;
    }

    // If exactly one of the ranges is exhausted, they were of different lengths.
    if( p1_it != p1_end || p2_it != p2_end ) {
        throw std::invalid_argument(
            "In f_st_pool_unbiased(): Provided ranges have different length."
        );
    }

    // We use the result overload here that does not do window averaging, and just returns the sum.
    return calc.get_result_pair();
}


#if __cplusplus >= 201402L


/**
 * @brief Compute the pairwise F_ST matrix for all samples of a range, using the first value
 * of the pair returned by the two-sample f_st_pool_unbiased().
 *
 * The range [ @p begin, @p end ) is forwarded to compute_pairwise_f_st(). For each pair of
 * sample indices that it requests, the pool sizes at these two indices in @p poolsizes are
 * used for the two-sample computation.
 *
 * @throws std::runtime_error if a requested sample index is not a valid index into @p poolsizes.
 */
template<class ForwardIterator>
utils::Matrix<double> f_st_pool_unbiased_nei(
    std::vector<size_t> const& poolsizes,
    ForwardIterator begin, ForwardIterator end
) {
    // Functor for a single pair of samples, looking up their pool sizes by index.
    auto pair_fst = [&poolsizes](
        size_t idx_a, size_t idx_b,
        auto a_first, auto a_last, auto b_first, auto b_last
    ){
        size_t const pool_count = poolsizes.size();
        if( !( idx_a < pool_count && idx_b < pool_count )) {
            throw std::runtime_error(
                "In f_st_pool_unbiased_nei(): Provided ranges have different lengths that "
                "are not identical to the number of poolsizes provided."
            );
        }

        // Only the first of the two estimates is of interest in this function.
        auto const estimates = f_st_pool_unbiased(
            poolsizes[idx_a], poolsizes[idx_b],
            a_first, a_last, b_first, b_last
        );
        return estimates.first;
    };

    return compute_pairwise_f_st( begin, end, pair_fst );
}


/**
 * @brief Compute the pairwise F_ST matrix for all samples of a range, using the second value
 * of the pair returned by the two-sample f_st_pool_unbiased().
 *
 * The range [ @p begin, @p end ) is forwarded to compute_pairwise_f_st(). For each pair of
 * sample indices that it requests, the pool sizes at these two indices in @p poolsizes are
 * used for the two-sample computation.
 *
 * @throws std::runtime_error if a requested sample index is not a valid index into @p poolsizes.
 */
template<class ForwardIterator>
utils::Matrix<double> f_st_pool_unbiased_hudson(
    std::vector<size_t> const& poolsizes,
    ForwardIterator begin, ForwardIterator end
) {
    // Functor for a single pair of samples, looking up their pool sizes by index.
    auto pair_fst = [&poolsizes](
        size_t idx_a, size_t idx_b,
        auto a_first, auto a_last, auto b_first, auto b_last
    ){
        size_t const pool_count = poolsizes.size();
        if( !( idx_a < pool_count && idx_b < pool_count )) {
            throw std::runtime_error(
                "In f_st_pool_unbiased_hudson(): Provided ranges have different lengths that "
                "are not identical to the number of poolsizes provided."
            );
        }

        // Only the second of the two estimates is of interest in this function.
        auto const estimates = f_st_pool_unbiased(
            poolsizes[idx_a], poolsizes[idx_b],
            a_first, a_last, b_first, b_last
        );
        return estimates.second;
    };

    return compute_pairwise_f_st( begin, end, pair_fst );
}


#endif // __cplusplus >= 201402L


} // namespace population

} // namespace genesis


#endif // include guard