A toolkit for working with phylogenetic data.
v0.24.0
slr.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_MATH_REGRESSION_SLR_H_
2 #define GENESIS_UTILS_MATH_REGRESSION_SLR_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2018 Lucas Czech and HITS gGmbH
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
35 
36 #include <cassert>
37 #include <cmath>
38 #include <cstddef>
39 
40 namespace genesis {
41 namespace utils {
42 
43 // =================================================================================================
44 // Simple Linear Regression
45 // =================================================================================================
46 
55 {
56  double slope;
57  double intercept;
58 
59  double y( double x ) const
60  {
61  return slope * x + intercept;
62  }
63 };
64 
73 template <class ForwardIteratorA, class ForwardIteratorB>
75  ForwardIteratorA first_x, ForwardIteratorA last_x,
76  ForwardIteratorB first_y, ForwardIteratorB last_y
77 ) {
78  // Calculate means of x and y := Mean(x), Mean(y) in parallel.
79  double mean_x = 0.0;
80  double mean_y = 0.0;
81  size_t count = 0;
82  for_each_finite_pair( first_x, last_x, first_y, last_y, [&]( double val_x, double val_y ){
83  mean_x += val_x;
84  mean_y += val_y;
85  ++count;
86  });
87  if( count == 0 ) {
88  return { std::numeric_limits<double>::quiet_NaN(), std::numeric_limits<double>::quiet_NaN() };
89  }
90  assert( count > 0 );
91  mean_x /= static_cast<double>( count );
92  mean_y /= static_cast<double>( count );
93  assert( std::isfinite( mean_x ));
94  assert( std::isfinite( mean_y ));
95 
96  // Calculate Cov(x,y) := covariance of x and y, and Var(x) := variance of x.
97  double covariance = 0.0;
98  double variance_x = 0.0;
99  for_each_finite_pair( first_x, last_x, first_y, last_y, [&]( double val_x, double val_y ){
100  double const dx = val_x - mean_x;
101  double const dy = val_y - mean_y;
102  covariance += dx * dy;
103  variance_x += dx * dx;
104  });
105  assert( std::isfinite( covariance ));
106  assert( std::isfinite( variance_x ));
107 
108  // The linear parameters are slope := Cov(x,y) / Var(x), and intercept = Mean(y) - slope * Mean(x).
109  LinearFunction result;
110  result.slope = covariance / variance_x;
111  result.intercept = mean_y - result.slope * mean_x;
112  return result;
113 }
114 
124 template <class ForwardIteratorA, class ForwardIteratorB>
126  ForwardIteratorA first_x, ForwardIteratorA last_x,
127  ForwardIteratorB first_y, ForwardIteratorB last_y,
128  LinearFunction lin_fct
129 ) {
130  double error = 0.0;
131  size_t count = 0;
132 
133  for_each_finite_pair( first_x, last_x, first_y, last_y, [&]( double val_x, double val_y ){
134  double const e = val_y - lin_fct.y( val_x );
135  error += e * e;
136  ++count;
137  });
138 
139  if( count == 0 ) {
140  assert( error == 0.0 );
141  return error;
142  }
143  return error / static_cast<double>( count );
144 }
145 
154 template <class ForwardIteratorA, class ForwardIteratorB>
156  ForwardIteratorA first_x, ForwardIteratorA last_x,
157  ForwardIteratorB first_y, ForwardIteratorB last_y,
158  LinearFunction lin_fct
159 ) {
160  // Prepare mean of y and the count of valid value pairs.
161  double y_mean = 0.0;
162  size_t count = 0;
163 
164  // First get mean of y. We use the x iterator only to make sure that we skip invalid pairs.
165  for_each_finite_pair( first_x, last_x, first_y, last_y, [&]( double, double y_val ){
166  y_mean += y_val;
167  ++count;
168  });
169  y_mean /= static_cast<double>( count );
170 
171  // Edge case.
172  if( count == 0 ) {
173  return 0.0;
174  }
175 
176  // Prepare sums of squares.
177  double ss_err = 0.0;
178  double ss_tot = 0.0;
179  // double ss_reg = 0.0;
180 
181  // Iterate again, this time calculate the components needed.
182  for_each_finite_pair( first_x, last_x, first_y, last_y, [&]( double x_val, double y_val ){
183  double const y_hat = lin_fct.y( x_val );
184 
185  double const d_err = y_val - y_hat;
186  double const d_tot = y_val - y_mean;
187  // double const d_reg = y_hat - y_mean;
188 
189  ss_err += d_err * d_err;
190  ss_tot += d_tot * d_tot;
191  // ss_reg += d_reg * d_reg;
192  });
193 
194  auto const fvu = ( ss_err / ss_tot );
195  // auto const fvu = ( 1.0 - ss_reg / ss_tot );
196  assert( 0.0 <= fvu && fvu <= 1.0 );
197  return fvu;
198 }
199 
200 } // namespace utils
201 } // namespace genesis
202 
203 #endif // include guard
double mean_squared_error(ForwardIteratorA first_x, ForwardIteratorA last_x, ForwardIteratorB first_y, ForwardIteratorB last_y, LinearFunction lin_fct)
Calculate the mean squared error obtained from a linear fit of the input variables.
Definition: slr.hpp:125
double y(double x) const
Definition: slr.hpp:59
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
LinearFunction simple_linear_regression(ForwardIteratorA first_x, ForwardIteratorA last_x, ForwardIteratorB first_y, ForwardIteratorB last_y)
Simple linear regression, for predicting the dependent variable y given the independent variable x...
Definition: slr.hpp:74
double fraction_of_variance_unexplained(ForwardIteratorA first_x, ForwardIteratorA last_x, ForwardIteratorB first_y, ForwardIteratorB last_y, LinearFunction lin_fct)
Calculate the fraction of unexplained variance resulting from a linear fit of the input variables...
Definition: slr.hpp:155
void for_each_finite_pair(ForwardIteratorA first_a, ForwardIteratorA last_a, ForwardIteratorB first_b, ForwardIteratorB last_b, std::function< void(double value_a, double value_b)> execute)
Iterate two ranges of double values in parallel, and execute a function for each pair of values from ...
Definition: common.hpp:223
Data structer to keep the two parameters of a linear function: its slope, and its intercept...
Definition: slr.hpp:54