A library for working with phylogenetic and population genetic data.
v0.32.0
glm.hpp
Go to the documentation of this file.
1 #ifndef GENESIS_UTILS_MATH_REGRESSION_GLM_H_
2 #define GENESIS_UTILS_MATH_REGRESSION_GLM_H_
3 
4 /*
5  Genesis - A toolkit for working with phylogenetic data.
6  Copyright (C) 2014-2024 Lucas Czech
7 
8  This program is free software: you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation, either version 3 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20 
21  Contact:
22  Lucas Czech <lucas.czech@h-its.org>
23  Exelixis Lab, Heidelberg Institute for Theoretical Studies
24  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
25 */
26 
37 
38 #include <utility>
39 #include <vector>
40 
41 namespace genesis {
42 namespace utils {
43 
44 // =================================================================================================
45 // GLM Data Structures
46 // =================================================================================================
47 
48 struct GlmExtras
49 {
50  std::vector<double> initial_fittings;
51  std::vector<double> prior_weights;
52  bool with_intercept = true;
53 
57  std::vector<size_t> strata;
58 
60  {
64  };
65 
66  ResidualType residual_type = ResidualType::kDefault;
67 
76  bool mean_deviance = false;
77 };
78 
79 struct GlmControl
80 {
84  size_t max_iterations = 25;
85 
90  double epsilon = 1.e-5;
91 
97  double max_r2 = 0.99;
98 };
99 
100 struct GlmOutput
101 {
102  bool converged = false;
103  size_t num_iterations = 0;
104 
108  size_t rank = 0;
109 
113  size_t df_resid = 0;
114 
118  double scale = 1.0;
119 
124 
128  std::vector<double> fitted;
129 
133  std::vector<double> resid;
134 
138  std::vector<double> weights;
139 
143  std::vector<double> which;
144 
150  std::vector<double> betaQ;
151 
156  std::vector<double> tri;
157 
165  double null_deviance = 0.0;
166 
174  double deviance = 0.0;
175 };
176 
177 // =================================================================================================
178 // GLM Fit
179 // =================================================================================================
180 
188  Matrix<double> const& x_predictors,
189  std::vector<double> const& y_response,
190  GlmFamily const& family,
191  GlmLink const& link,
192  GlmExtras const& extras = {},
193  GlmControl const& control = {}
194 );
195 
204 GlmOutput glm_fit(
205  Matrix<double> const& x_predictors,
206  std::vector<double> const& y_response,
207  GlmFamily const& family,
208  GlmExtras const& extras = {},
209  GlmControl const& control = {}
210 );
211 
218 GlmOutput glm_fit(
219  Matrix<double> const& x_predictors,
220  std::vector<double> const& y_response,
221  GlmExtras const& extras = {},
222  GlmControl const& control = {}
223 );
224 
225 // =================================================================================================
226 // GLM Output
227 // =================================================================================================
228 
237 std::vector<double> glm_estimate_betas( GlmOutput const& output );
238 
239 // /**
240 // * @brief Obtain beta estimates and variance covariance matrix of estimates from output the output
241 // * of glm_fit().
242 // *
243 // * The resulting variance covariance matrix is a packed symmetric matrix with the size of the
244 // * number of predictor variables (which is the size of the betas).
245 // * Robust variance is calculated if the "meat" matrix for the information sandwich is supplied.
246 // */
247 // std::pair<std::vector<double>, std::vector<double>> glm_estimate_betas_and_var_covar(
248 // GlmOutput const& output,
249 // std::vector<double> const& meat = std::vector<double>{}
250 // );
251 
262  Matrix<double> const& x_predictors,
263  std::vector<double> const& y_response,
264  GlmOutput const& output,
265  std::vector<double> const& betas
266 );
267 
285  Matrix<double> const& x_predictors,
286  std::vector<double> const& y_response,
287  GlmLink const& link,
288  GlmOutput const& output,
289  std::vector<double> const& betas
290 );
291 
303 std::vector<double> glm_coefficients(
304  Matrix<double> const& x_predictors,
305  std::vector<double> const& y_response,
306  GlmOutput const& output
307 );
308 
319 std::vector<double> glm_coefficients(
320  Matrix<double> const& x_predictors,
321  std::vector<double> const& y_response,
322  GlmLink const& link,
323  GlmOutput const& output
324 );
325 
326 } // namespace utils
327 } // namespace genesis
328 
329 #endif // include guard
genesis::utils::GlmExtras::mean_deviance
bool mean_deviance
Calculate mean null_deviance and mean deviance instead of their sums.
Definition: glm.hpp:76
genesis::utils::GlmExtras::kPearsonResiduals
@ kPearsonResiduals
Definition: glm.hpp:62
genesis::utils::glm_coefficients
std::vector< double > glm_coefficients(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmOutput const &output)
Compute the model coefficients of a glm_fit().
Definition: glm.cpp:775
genesis::utils::GlmOutput::rank
size_t rank
Rank of X after regression on strata.
Definition: glm.hpp:108
genesis::utils::GlmOutput
Definition: glm.hpp:100
genesis::utils::GlmOutput::scale
double scale
Scale factor (scalar).
Definition: glm.hpp:118
genesis::utils::GlmFamily
Definition: family.hpp:48
genesis::utils::GlmOutput::num_iterations
size_t num_iterations
Definition: glm.hpp:103
genesis::utils::GlmControl::max_iterations
size_t max_iterations
Maximum number of iterations to run the IRLS algorithm for (if needed).
Definition: glm.hpp:84
genesis::utils::GlmExtras
Definition: glm.hpp:48
genesis::utils::glm_fit
GlmOutput glm_fit(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmFamily const &family, GlmLink const &link, GlmExtras const &extras, GlmControl const &control)
Fit a Generalized Linear Model (GLM).
Definition: glm.cpp:355
genesis::utils::GlmOutput::Xb
Matrix< double > Xb
Orthogonal basis for X space (N * M matrix, with N * rank being used).
Definition: glm.hpp:123
genesis::utils::GlmExtras::ResidualType
ResidualType
Definition: glm.hpp:59
genesis::utils::GlmControl
Definition: glm.hpp:79
genesis::utils::Matrix< double >
genesis::utils::glm_estimate_intercept
double glm_estimate_intercept(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmOutput const &output, std::vector< double > const &betas)
Compute the intercept resulting from a glm_fit().
Definition: glm.cpp:714
genesis::utils::GlmOutput::null_deviance
double null_deviance
Null deviance.
Definition: glm.hpp:165
genesis::utils::GlmOutput::tri
std::vector< double > tri
Upper unit triangular transformation matrix, with Xb - tr.Xb placed in the diagonal (size (M * (M+1))...
Definition: glm.hpp:156
genesis::utils::GlmExtras::residual_type
ResidualType residual_type
Definition: glm.hpp:66
genesis::utils::GlmExtras::strata
std::vector< size_t > strata
Strata assignments coded 1...S.
Definition: glm.hpp:57
genesis::utils::GlmOutput::weights
std::vector< double > weights
Weights (size N)
Definition: glm.hpp:138
genesis::utils::GlmOutput::deviance
double deviance
Deviance.
Definition: glm.hpp:174
genesis::utils::GlmControl::max_r2
double max_r2
Threshold for singluarities. Internally used as eta = 1.0 - max_r2.
Definition: glm.hpp:97
matrix.hpp
genesis::utils::GlmOutput::converged
bool converged
Definition: glm.hpp:102
genesis::utils::GlmExtras::initial_fittings
std::vector< double > initial_fittings
Definition: glm.hpp:50
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::GlmControl::epsilon
double epsilon
Proportional change in weighted sum of squares residuals to declare convergence between two iteration...
Definition: glm.hpp:90
genesis::utils::GlmOutput::df_resid
size_t df_resid
Residual degrees of freedom.
Definition: glm.hpp:113
genesis::utils::glm_estimate_betas
std::vector< double > glm_estimate_betas(GlmOutput const &output)
Compute the beta estimates resulting from a glm_fit().
Definition: glm.cpp:605
family.hpp
genesis::utils::GlmOutput::which
std::vector< double > which
Which columns in the X matrix were estimated (first = 0) (size M).
Definition: glm.hpp:143
genesis::utils::GlmExtras::kDefault
@ kDefault
Definition: glm.hpp:61
genesis::utils::GlmOutput::fitted
std::vector< double > fitted
Fitted values (size N).
Definition: glm.hpp:128
genesis::utils::GlmExtras::with_intercept
bool with_intercept
Definition: glm.hpp:52
genesis::utils::GlmExtras::prior_weights
std::vector< double > prior_weights
Definition: glm.hpp:51
genesis::utils::GlmOutput::betaQ
std::vector< double > betaQ
Vector of parameter estimates (in terms of basis matrix, Xb) (size M).
Definition: glm.hpp:150
genesis::utils::GlmExtras::kDevianceResiduals
@ kDevianceResiduals
Definition: glm.hpp:63
genesis::utils::GlmOutput::resid
std::vector< double > resid
Working residuals (on linear predictor scale) (size N).
Definition: glm.hpp:133