A toolkit for working with phylogenetic data.
v0.24.0
glm.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2019 Lucas Czech and HITS gGmbH
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
31 /*
32  =================================================================
33  License
34  =================================================================
35 
36  The implementation is based on the snp.matrix and X.snp.matrix classes by
37  David Clayton <david.clayton@cimr.cam.ac.uk> and Hin-Tak Leung <htl10@users.sourceforge.net>.
38  The source is in C, but is originally intended for usage in R.
39  This file in particular is based on snpStats_1.32.0/src/glm_test.c
40  We massively refactored the original code, for example by using vectors and matrices instead
41  of pointers, and by using proper data structures instead of lists of in/out function parameters.
42  Furthermore, we added some new code for calculating additional statistical values such as the deviance.
43 
44  The original code is published under the GNU General Public Licence version 3 (GPLv3).
45  We use the same license, hence see above for details.
46 
47  The snp.matrix and X.snp.matrix classes.
48  Copyright (C) 2008 David Clayton and Hin-Tak Leung
49 
50  The package does not seem to be maintained any more, and does not seem to
51  have a proper repository. For more information, try these sites:
52  https://bioconductor.org/packages/release/bioc/html/snpStats.html
53  https://www.rdocumentation.org/packages/snpStats/
54  http://www-gene.cimr.cam.ac.uk/clayton/software/
55 */
56 
58 
63 
64 #include <algorithm>
65 #include <cassert>
66 #include <cmath>
67 #include <cstdio>
68 #include <cstdlib>
69 #include <cstring>
70 #include <functional>
71 #include <limits>
72 #include <stdexcept>
73 
74 namespace genesis {
75 namespace utils {
76 
77 // =================================================================================================
78 // Iteratively Reweighted Least Squares
79 // =================================================================================================
80 
81 static void glm_irls_(
82  Matrix<double> const& x_predictors,
83  std::vector<double> const& y_response,
84  GlmFamily const& family,
85  GlmLink const& link,
86  GlmExtras const& extras,
87  GlmControl const& control,
88  GlmOutput& result
89 ) {
90  // Some shortcuts.
91  auto const N = y_response.size();
92  auto const M = x_predictors.cols();
93 
94  // Already checked in main function. Assert here again for better overview.
95  assert( x_predictors.rows() == N );
96  assert( extras.prior_weights.empty() || extras.prior_weights.size() == N );
97  assert( extras.strata.empty() || extras.strata.size() == N );
98 
99  // Working data.
100  auto y_working = std::vector<double>( N );
101 
102  // Default scale factor
103  result.scale = 1.0;
104 
105  result.num_iterations = 0;
106  result.converged = false;
107  double logL_prev = 0.0;
108  while( result.num_iterations < control.max_iterations && ! result.converged ) {
109  for( size_t i = 0; i < N; i++ ) {
110  // TODO this used the family id instead of the link id. is that correct?
111  y_working[i] = result.resid[i] + link.link( result.fitted[i] );
112  }
113  auto freedom = weighted_mean_centering(
114  y_working, result.weights, extras.strata, extras.with_intercept, true, result.resid
115  );
116 
117  // Columns in Xb matrix
118  result.rank = 0;
119 
120  // Orthogonal basis matrix
121  size_t xb_col = 0;
122  auto xb_tmp = std::vector<double>();
123 
124  // Loop over columns of X matrix
125  size_t ii = 0;
126  size_t ij = 0;
127  for( size_t i = 0; i < M; ++i ) {
128  // Center
130  x_predictors.col(i), result.weights, extras.strata,
131  extras.with_intercept, true, xb_tmp
132  );
133  result.Xb.col( xb_col ) = xb_tmp;
134 
135  // Corrected SSQ
136  double const ssx = weighted_sum_of_squares(
137  result.Xb.col( xb_col ), result.weights
138  );
139  double ssr = ssx;
140 
141  // Regress on earlier columns
142  if( result.rank > 0 ) {
143  for( size_t j = 0; j < result.rank; ++j ) {
144 
145  // Coefficient
146  double bij = weighted_residuals(
147  result.Xb.col( j ),
148  result.Xb.col( xb_col ),
149  result.weights,
150  xb_tmp
151  );
152  result.Xb.col( xb_col ) = xb_tmp;
153 
154  // Save in off-diagonal elements of tri
155  result.tri[ij] = bij;
156  ++ij;
157  }
158  ssr = weighted_sum_of_squares( result.Xb.col( xb_col ), result.weights );
159  }
160 
161  // Check if greater than singularity threshold
162  if ((ssx > 0.0) && (ssr / ssx > 1.0 - control.max_r2)) {
163  double const bQi = weighted_residuals(
164  result.Xb.col( xb_col ),
165  result.resid,
166  result.weights,
167  result.resid
168  );
169 
170  ++result.rank;
171  ++xb_col;
172  assert(( xb_col < M ) xor ( i+1 == M ));
173 
174  // Diagonal elements of tri
175  result.tri[ij] = ssr;
176  result.which[ii] = i;
177  result.betaQ[ii] = bQi;
178  ++ii;
179  ++ij;
180  } else {
181 
182  // If singularity, drop off-diagonal elements of tri
183  ij -= result.rank;
184  }
185  }
186 
187  double wss = 0.0;
188  freedom.valid_entries = 0;
189  double logL = 0.0;
190  for( size_t i = 0; i < N; ++i ) {
191  double ri;
192  double wi;
193  double const mu = link.inverse_link( y_working[i] - result.resid[i] );
194  double const pi = extras.prior_weights.empty() ? 1.0 : extras.prior_weights[i];
195 
196  result.fitted[i] = family.rectify( mu );
197  logL += pi * family.log_likelihood( y_response[i], mu );
198 
199  if( !( pi != 0.0 && ( result.weights[i] > 0.0 ))) {
200  wi = 0.0;
201  ri = 0.0;
202  } else {
203  double const Vmu = family.variance( mu );
204  ++freedom.valid_entries;
205 
206  if( link.id == family.canonical_link_id ) {
207  wi = pi * Vmu;
208 
209  switch( extras.residual_type ) {
210  case GlmExtras::kDefault: {
211  ri = ( y_response[i] - mu ) / Vmu;
212  break;
213  }
215  ri = ( y_response[i] - mu ) / std::sqrt( Vmu );
216  break;
217  }
219  auto const ud = family.unit_deviance( y_response[i], mu );
220  ri = signum( y_response[i] - mu ) * std::sqrt( ud );
221  break;
222  }
223  default: {
224  throw std::invalid_argument( "Invalid residual type specified." );
225  }
226  }
227  } else {
228  double const D = link.derivative( mu );
229  wi = pi / (D * D * Vmu);
230  ri = D * (y_response[i] - mu);
231  }
232  wss += wi * ri * ri;
233  }
234  result.weights[i] = wi;
235  result.resid[i] = ri;
236  }
237 
238  long const dfr = freedom.degrees_of_freedom( result.rank );
239  result.df_resid = dfr > 0 ? static_cast<size_t>( dfr ) : 0;
240  if( family.id == GlmFamily::kGaussian || family.id == GlmFamily::kGamma ) {
241  result.scale = wss / (dfr);
242  }
243 
244  // Check for convergence and iterate if necessary.
245  if( result.num_iterations > 1 ) {
246  double dL = (logL - logL_prev) / (result.scale);
247  if( dL < control.epsilon ) {
248  result.converged = true;
249  }
250  }
251  logL_prev = logL;
252  ++result.num_iterations;
253  }
254 
255  for( size_t i = 0; i < N; i++ ) {
256  result.fitted[i] = link.inverse_link( y_working[i] - result.resid[i] );
257  }
258 }
259 
260 // =================================================================================================
261 // Simple Linear Gaussian Case
262 // =================================================================================================
263 
264 static void glm_gaussian_(
265  Matrix<double> const& x_predictors,
266  std::vector<double> const& y_response,
267  GlmExtras const& extras,
268  GlmControl const& control,
269  GlmFreedom const& freedom,
270  GlmOutput& result
271 ) {
272  // Some shortcuts.
273  auto const N = y_response.size();
274  auto const M = x_predictors.cols();
275 
276  // Already checked in main function. Assert here again for better overview.
277  assert( x_predictors.rows() == N );
278  assert( extras.strata.empty() || extras.strata.size() == N );
279 
280  size_t xb_col = 0;
281  auto xb_tmp = std::vector<double>();
282  result.rank = 0;
283 
284  size_t ii = 0;
285  size_t ij = 0;
286  for( size_t i = 0; i < M; i++ ) {
288  x_predictors.col(i), result.weights, extras.strata,
289  extras.with_intercept, true, xb_tmp
290  );
291  result.Xb.col( xb_col ) = xb_tmp;
292 
293  double ssx = weighted_sum_of_squares( result.Xb.col( xb_col ), result.weights );
294  double ssr = ssx;
295 
296  // Regress on earlier columns
297  if( result.rank > 0 ) {
298  for( size_t j = 0; j < result.rank; ++j ) {
299  double const bij = weighted_residuals(
300  result.Xb.col( j ),
301  result.Xb.col( xb_col ),
302  result.weights,
303  xb_tmp
304  );
305  result.Xb.col( xb_col ) = xb_tmp;
306 
307  // Off-diagonal
308  result.tri[ij] = bij;
309  ++ij;
310  }
311  ssr = weighted_sum_of_squares( result.Xb.col( xb_col ), result.weights );
312  }
313 
314  // Check if we are above the singularity threshold
315  if(( ssx > 0.0) && (ssr / ssx > 1.0 - control.max_r2 )) {
316  double const bQi = weighted_residuals(
317  result.Xb.col( xb_col ),
318  result.resid,
319  result.weights,
320  result.resid
321  );
322 
323  ++result.rank;
324  ++xb_col;
325 
326  // Diagonal
327  result.tri[ij] = ssr;
328  result.which[ii] = i;
329  result.betaQ[ii] = bQi;
330  ++ij;
331  ++ii;
332  } else {
333  // We have a singluarity. Drop the element.
334  ij -= result.rank;
335  }
336  }
337 
338  for( size_t i = 0; i < N; ++i ) {
339  result.fitted[i] = y_response[i] - result.resid[i];
340  }
341 
342  double const wss = weighted_sum_of_squares( result.resid, result.weights );
343  long const dfr = freedom.degrees_of_freedom( result.rank );
344  result.scale = wss / dfr;
345  result.df_resid = dfr > 0 ? static_cast<size_t>( dfr ) : 0;
346 
347  result.converged = true;
348  result.num_iterations = 0;
349 }
350 
351 // =================================================================================================
352 // Generalized Linear Model
353 // =================================================================================================
354 
356  Matrix<double> const& x_predictors,
357  std::vector<double> const& y_response,
358  GlmFamily const& family,
359  GlmLink const& link,
360  GlmExtras const& extras,
361  GlmControl const& control
362 ) {
363 
364  // Some shortcuts.
365  auto const N = y_response.size();
366  auto const M = x_predictors.cols();
367 
368  // Error checks.
369  if( x_predictors.rows() != N ) {
370  throw std::invalid_argument( "glm_fit: size of rows of x is not size of y." );
371  }
372  if( ! extras.initial_fittings.empty() && extras.initial_fittings.size() != N ) {
373  throw std::invalid_argument( "glm_fit: size of initial fittings is not size of y." );
374  }
375  assert( extras.initial_fittings.empty() || extras.initial_fittings.size() == N );
376  if( ! extras.prior_weights.empty() && extras.prior_weights.size() != N ) {
377  throw std::invalid_argument( "glm_fit: size of prior weights is not size of y." );
378  }
379  assert( extras.prior_weights.empty() || extras.prior_weights.size() == N );
380  if( ! extras.strata.empty() && extras.strata.size() != N ) {
381  throw std::invalid_argument( "glm_fit: size of strata is not size of y." );
382  }
383  assert( extras.strata.empty() || extras.strata.size() == N );
384  if( control.epsilon <= 0.0 || control.epsilon > 1.0 ) {
385  throw std::invalid_argument( "glm_fit: epsilon has to be in ( 0.0, 1.0 ]" );
386  }
387  if( control.max_r2 <= 0.0 || control.max_r2 >= 1.0 ) {
388  throw std::invalid_argument( "glm_fit: max_r2 has to be in ( 0.0, 1.0 )" );
389  }
390  if( ! is_defined( family )) {
391  throw std::invalid_argument( "glm_fit: family is not properly defined." );
392  }
393  if( ! is_defined( link )) {
394  throw std::invalid_argument( "glm_fit: link is not properly defined." );
395  }
396 
397  // TODO interactions
398 
399  // Prepare results.
400  GlmOutput result;
401  result.Xb = Matrix<double>( N, M );
402  result.fitted = std::vector<double>( N );
403  result.resid = std::vector<double>( N );
404  result.weights = std::vector<double>( N );
405  result.which = std::vector<double>( M );
406  result.betaQ = std::vector<double>( M );
407  result.tri = std::vector<double>( (M * ( M+1 )) / 2 );
408 
409  // Is iteration necessary?
410  bool const irls = (M > 0) && !(
411  ( family.id == GlmFamily::kGaussian ) && ( link.id == GlmLink::kIdentity )
412  );
413 
414  // Initialize the fittings.
415  GlmFreedom freedom;
416  if( extras.initial_fittings.empty() || ! irls ) {
417  // Fit intercept and/or strata part of model,
418  // that is, set the fitted values to the (strata) (weighted) mean of the y values.
419  freedom = weighted_mean_centering(
420  y_response, extras.prior_weights, extras.strata, extras.with_intercept, false, result.fitted
421  );
422  } else {
423  assert( irls );
424  assert( extras.initial_fittings.size() == N );
425  result.fitted = extras.initial_fittings;
426  }
427 
428  // Prepare residuals and weights, and calculate null deviance.
429  freedom.valid_entries = 0;
430  assert( result.null_deviance == 0.0 );
431  for( size_t i = 0; i < N; i++ ) {
432  double const mu = result.fitted[i];
433  double const pi = extras.prior_weights.empty() ? 1.0 : extras.prior_weights[i];
434 
435  // Null deviance
436  auto const ud = family.unit_deviance( y_response[i], mu );
437  if( std::isfinite( ud )) {
438  result.null_deviance += ud;
439  }
440 
441  // Residuals and weights.
442  if(( ! std::isfinite(pi) ) || ( pi < 0.0 )) {
443  throw std::invalid_argument( "glm_fit: prior weights have to be non-negative." );
444  } else if( pi == 0.0 ) {
445  result.resid[i] = 0.0;
446  result.weights[i] = 0.0;
447  } else {
448  assert( std::isfinite( pi ) || ( pi > 0.0 ));
449  ++freedom.valid_entries;
450 
451  double const Vmu = family.variance( mu );
452  if( link.id == family.canonical_link_id ) {
453  switch( extras.residual_type ) {
454  case GlmExtras::kDefault: {
455  result.resid[i] = ( y_response[i] - mu ) / Vmu;
456  break;
457  }
459  result.resid[i] = ( y_response[i] - mu ) / std::sqrt( Vmu );
460  break;
461  }
463  result.resid[i] = signum( y_response[i] - mu ) * std::sqrt( ud );
464  break;
465  }
466  default: {
467  throw std::invalid_argument( "Invalid residual type specified." );
468  }
469  }
470  result.weights[i] = pi * Vmu;
471  } else {
472  double const D = link.derivative( mu );
473  result.resid[i] = D * (y_response[i] - mu);
474  result.weights[i] = pi / (D * D * Vmu);
475  }
476  }
477  }
478  if( extras.mean_deviance ) {
479  result.null_deviance /= static_cast<double>(N);
480  }
481 
482  // If X has data, include covariates
483  if( M > 0 ) {
484 
485  // IRLS algorithm, or simple linear Gaussian case
486  if( irls ) {
487  glm_irls_( x_predictors, y_response, family, link, extras, control, result );
488  } else {
489  glm_gaussian_( x_predictors, y_response, extras, control, freedom, result );
490  }
491 
492  // Calcualte deviance.
493  assert( result.deviance == 0.0 );
494  for( size_t i = 0; i < N; i++ ) {
495  auto const ud = family.unit_deviance( y_response[i], result.fitted[i] );
496  if( std::isfinite( ud )) {
497  result.deviance += ud;
498  }
499  }
500  if( extras.mean_deviance ) {
501  result.deviance /= static_cast<double>(N);
502  }
503 
504  } else {
505 
506  // No covariates
507  auto const dfr = freedom.degrees_of_freedom( 0 );
508  if( family.id == GlmFamily::kGaussian || family.id == GlmFamily::kGamma ) {
509  result.scale = weighted_sum_of_squares( result.resid, result.weights ) / (dfr);
510  } else {
511  result.scale = 1.0;
512  }
513  result.df_resid = dfr > 0 ? static_cast<size_t>( dfr ) : 0;
514  }
515 
516  return result;
517 }
518 
520  Matrix<double> const& x_predictors,
521  std::vector<double> const& y_response,
522  GlmFamily const& family,
523  GlmExtras const& extras,
524  GlmControl const& control
525 ) {
526  if( ! family.canonical_link ) {
527  throw std::runtime_error( "glm_fit: family does not provide a canonical link." );
528  }
529  return glm_fit( x_predictors, y_response, family, family.canonical_link(), extras, control );
530 }
531 
533  Matrix<double> const& x_predictors,
534  std::vector<double> const& y_response,
535  GlmExtras const& extras,
536  GlmControl const& control
537 ) {
538  auto family = glm_family_gaussian();
539  return glm_fit( x_predictors, y_response, family, family.canonical_link(), extras, control );
540 }
541 
542 } // namespace utils
543 } // namespace genesis
std::vector< double > prior_weights
Definition: glm.hpp:50
std::vector< double > which
Which columns in the X matrix were estimated (first = 0) (size M).
Definition: glm.hpp:142
double weighted_sum_of_squares(std::vector< double > const &x_input, std::vector< double > const &weights)
(Weighted) sum of squares.
double weighted_residuals(std::vector< double > const &x_input, std::vector< double > const &y_input, std::vector< double > const &weights, std::vector< double > &y_output)
Calculate the residuals from (weighted) regression through the origin.
std::function< double(double y, double mu)> unit_deviance
Unit deviance for the distribution family.
Definition: family.hpp:90
std::vector< double > tri
Upper unit triangular transformation matrix, with Xb - tr.Xb placed in the diagonal (size (M * (M+1))...
Definition: glm.hpp:153
Internal helper structure for GLMs to calcualte the residual degrees of freedom.
GlmFamily glm_family_gaussian()
Gaussian/normal family functions.
Definition: family.hpp:226
ResidualType residual_type
Definition: glm.hpp:65
size_t valid_entries
Number of valid priors (Nu).
constexpr int signum(T x, std::false_type)
Implementation of signum(T x) for unsigned types. See there for details.
Definition: common.hpp:81
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
bool is_defined(GlmFamily const &family)
Check whether all necessary values and functors of a GlmFamily are set.
Definition: family.hpp:101
size_t max_iterations
Maximum number of iterations to run the IRLS algorithm for (if needed).
Definition: glm.hpp:83
double scale
Scale factor (scalar).
Definition: glm.hpp:117
std::vector< double > weights
Weights (size N)
Definition: glm.hpp:137
std::vector< size_t > strata
Strata assignments coded 1...S.
Definition: glm.hpp:56
GlmFreedom weighted_mean_centering(std::vector< double > const &y_input, std::vector< double > const &weights, std::vector< size_t > const &strata, bool with_intercept, bool centering, std::vector< double > &y_output)
(Weighted) mean and centering.
Matrix< double > Xb
Orthogonal basis for X space (N * M matrix, with N * rank being used).
Definition: glm.hpp:122
MatrixCol< self_type, value_type > col(size_t col)
std::function< double(double mu)> variance
Variance function for the distribution family.
Definition: family.hpp:75
std::vector< double > initial_fittings
Definition: glm.hpp:49
std::function< GlmLink()> canonical_link
Get the canonical link function.
Definition: family.hpp:95
static void glm_gaussian_(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmExtras const &extras, GlmControl const &control, GlmFreedom const &freedom, GlmOutput &result)
Definition: glm.cpp:264
static void glm_irls_(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmFamily const &family, GlmLink const &link, GlmExtras const &extras, GlmControl const &control, GlmOutput &result)
Definition: glm.cpp:81
std::vector< double > resid
Working residuals (on linear predictor scale) (size N).
Definition: glm.hpp:132
double max_r2
Threshold for singluarities. Internally used as eta = 1.0 - max_r2.
Definition: glm.hpp:96
std::function< double(double mu)> rectify
Rectify to a valid value, for the fitted mean, to avoid extreme predictions.
Definition: family.hpp:85
Provides easy and fast logging functionality.
double epsilon
Proportional change in weighted sum of squares residuals to declare convergence between two iteration...
Definition: glm.hpp:89
GlmOutput glm_fit(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmFamily const &family, GlmLink const &link, GlmExtras const &extras, GlmControl const &control)
Fit a Generalized Linear Model (GLM).
Definition: glm.cpp:355
long degrees_of_freedom(size_t rank) const
Calculate the degrees of freedom (dfr).
GlmLink::Link canonical_link_id
Internal ID of the GlmLink, used to check if the link is the canonical one for a given distribution f...
Definition: family.hpp:70
std::function< double(double y, double mu)> log_likelihood
Log-Likelihood contribution of a value. To be multiplied by prior weight.
Definition: family.hpp:80
std::vector< double > betaQ
Vector of parameter estimates (in terms of basis matrix, Xb) (size M).
Definition: glm.hpp:147
size_t rank
Rank of X after regression on strata.
Definition: glm.hpp:107
bool mean_deviance
Calculate mean null_deviance and mean deviance instead of their sums.
Definition: glm.hpp:75
Family id
Internal ID of the GlmFamily, used to check for specific families where needed.
Definition: family.hpp:64
size_t df_resid
Residual degrees of freedom.
Definition: glm.hpp:112
std::vector< double > fitted
Fitted values (size N).
Definition: glm.hpp:127