A library for working with phylogenetic and population genetic data.
v0.32.0
glm.cpp
Go to the documentation of this file.
1 /*
2  Genesis - A toolkit for working with phylogenetic data.
3  Copyright (C) 2014-2024 Lucas Czech
4 
5  This program is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 
18  Contact:
19  Lucas Czech <lucas.czech@h-its.org>
20  Exelixis Lab, Heidelberg Institute for Theoretical Studies
21  Schloss-Wolfsbrunnenweg 35, D-69118 Heidelberg, Germany
22 */
23 
31 /*
32  =================================================================
33  License
34  =================================================================
35 
36  The implementation is based on the snp.matrix and X.snp.matrix classes by
37  David Clayton <david.clayton@cimr.cam.ac.uk> and Hin-Tak Leung <htl10@users.sourceforge.net>.
38  The source is in C, but is originally intended for usage in R.
39  This file in particular is based on snpStats_1.32.0/src/glm_test.c
40  We massively refactored the original code, for example by using vectors and matrices instead
41  of pointers, and by using proper data structures instead of lists of in/out function parameters.
42  Furthermore, we added some new code for calculating additional statistical values such as the deviance.
43 
44  The original code is published under the GNU General Public Licence version 3 (GPLv3).
45  We use the same license, hence see above for details.
46 
47  The snp.matrix and X.snp.matrix classes.
48  Copyright (C) 2008 David Clayton and Hin-Tak Leung
49 
50  The package does not seem to be maintained any more, and does not seem to
51  have a proper repository. For more information, try these sites:
52  https://bioconductor.org/packages/release/bioc/html/snpStats.html
53  https://www.rdocumentation.org/packages/snpStats/
54  http://www-gene.cimr.cam.ac.uk/clayton/software/
55 */
56 
58 
63 
64 #include <algorithm>
65 #include <cassert>
66 #include <cmath>
67 #include <cstdio>
68 #include <cstdlib>
69 #include <cstring>
70 #include <functional>
71 #include <limits>
72 #include <stdexcept>
73 
74 namespace genesis {
75 namespace utils {
76 
77 // =================================================================================================
78 // Iteratively Reweighted Least Squares
79 // =================================================================================================
80 
81 static void glm_irls_(
82  Matrix<double> const& x_predictors,
83  std::vector<double> const& y_response,
84  GlmFamily const& family,
85  GlmLink const& link,
86  GlmExtras const& extras,
87  GlmControl const& control,
88  GlmOutput& result
89 ) {
90  // Some shortcuts.
91  auto const N = y_response.size();
92  auto const M = x_predictors.cols();
93 
94  // Already checked in main function. Assert here again for better overview.
95  assert( x_predictors.rows() == N );
96  assert( extras.prior_weights.empty() || extras.prior_weights.size() == N );
97  assert( extras.strata.empty() || extras.strata.size() == N );
98 
99  // Working data.
100  auto y_working = std::vector<double>( N );
101 
102  // Default scale factor
103  result.scale = 1.0;
104 
105  result.num_iterations = 0;
106  result.converged = false;
107  double logL_prev = 0.0;
108  while( result.num_iterations < control.max_iterations && ! result.converged ) {
109  for( size_t i = 0; i < N; i++ ) {
110  // TODO this used the family id instead of the link id. is that correct?
111  y_working[i] = result.resid[i] + link.link( result.fitted[i] );
112  }
113  auto freedom = weighted_mean_centering(
114  y_working, result.weights, extras.strata, extras.with_intercept, true, result.resid
115  );
116 
117  // Columns in Xb matrix
118  result.rank = 0;
119 
120  // Orthogonal basis matrix
121  size_t xb_col = 0;
122  auto xb_tmp = std::vector<double>();
123 
124  // Loop over columns of X matrix
125  size_t ii = 0;
126  size_t ij = 0;
127  for( size_t i = 0; i < M; ++i ) {
128  // Center
130  x_predictors.col(i), result.weights, extras.strata,
131  extras.with_intercept, true, xb_tmp
132  );
133  result.Xb.col( xb_col ) = xb_tmp;
134 
135  // Corrected SSQ
136  double const ssx = weighted_sum_of_squares(
137  result.Xb.col( xb_col ), result.weights
138  );
139  double ssr = ssx;
140 
141  // Regress on earlier columns
142  if( result.rank > 0 ) {
143  for( size_t j = 0; j < result.rank; ++j ) {
144 
145  // Coefficient
146  double bij = weighted_residuals(
147  result.Xb.col( j ),
148  result.Xb.col( xb_col ),
149  result.weights,
150  xb_tmp
151  );
152  result.Xb.col( xb_col ) = xb_tmp;
153 
154  // Save in off-diagonal elements of tri
155  result.tri[ij] = bij;
156  ++ij;
157  }
158  ssr = weighted_sum_of_squares( result.Xb.col( xb_col ), result.weights );
159  }
160 
161  // Check if greater than singularity threshold
162  if ((ssx > 0.0) && (ssr / ssx > 1.0 - control.max_r2)) {
163  double const bQi = weighted_residuals(
164  result.Xb.col( xb_col ),
165  result.resid,
166  result.weights,
167  result.resid
168  );
169 
170  ++result.rank;
171  ++xb_col;
172  assert(( xb_col < M ) xor ( i+1 == M ));
173 
174  // Diagonal elements of tri
175  result.tri[ij] = ssr;
176  result.which[ii] = i;
177  result.betaQ[ii] = bQi;
178  ++ii;
179  ++ij;
180  } else {
181 
182  // If singularity, drop off-diagonal elements of tri
183  ij -= result.rank;
184  }
185  }
186 
187  double wss = 0.0;
188  freedom.valid_entries = 0;
189  double logL = 0.0;
190  for( size_t i = 0; i < N; ++i ) {
191  double ri;
192  double wi;
193  double const mu = link.inverse_link( y_working[i] - result.resid[i] );
194  double const pi = extras.prior_weights.empty() ? 1.0 : extras.prior_weights[i];
195 
196  result.fitted[i] = family.rectify( mu );
197  logL += pi * family.log_likelihood( y_response[i], mu );
198 
199  if( !( pi != 0.0 && ( result.weights[i] > 0.0 ))) {
200  wi = 0.0;
201  ri = 0.0;
202  } else {
203  double const Vmu = family.variance( mu );
204  ++freedom.valid_entries;
205 
206  if( link.id == family.canonical_link_id ) {
207  wi = pi * Vmu;
208 
209  switch( extras.residual_type ) {
210  case GlmExtras::kDefault: {
211  ri = ( y_response[i] - mu ) / Vmu;
212  break;
213  }
215  ri = ( y_response[i] - mu ) / std::sqrt( Vmu );
216  break;
217  }
219  auto const ud = family.unit_deviance( y_response[i], mu );
220  ri = signum( y_response[i] - mu ) * std::sqrt( ud );
221  break;
222  }
223  default: {
224  throw std::invalid_argument( "Invalid residual type specified." );
225  }
226  }
227  } else {
228  double const D = link.derivative( mu );
229  wi = pi / (D * D * Vmu);
230  ri = D * (y_response[i] - mu);
231  }
232  wss += wi * ri * ri;
233  }
234  result.weights[i] = wi;
235  result.resid[i] = ri;
236  }
237 
238  long const dfr = freedom.degrees_of_freedom( result.rank );
239  result.df_resid = dfr > 0 ? static_cast<size_t>( dfr ) : 0;
240  if( family.id == GlmFamily::kGaussian || family.id == GlmFamily::kGamma ) {
241  result.scale = wss / (dfr);
242  }
243 
244  // Check for convergence and iterate if necessary.
245  if( result.num_iterations > 1 ) {
246  double dL = (logL - logL_prev) / (result.scale);
247  if( dL < control.epsilon ) {
248  result.converged = true;
249  }
250  }
251  logL_prev = logL;
252  ++result.num_iterations;
253  }
254 
255  for( size_t i = 0; i < N; i++ ) {
256  result.fitted[i] = link.inverse_link( y_working[i] - result.resid[i] );
257  }
258 }
259 
260 // =================================================================================================
261 // Simple Linear Gaussian Case
262 // =================================================================================================
263 
264 static void glm_gaussian_(
265  Matrix<double> const& x_predictors,
266  std::vector<double> const& y_response,
267  GlmExtras const& extras,
268  GlmControl const& control,
269  GlmFreedom const& freedom,
270  GlmOutput& result
271 ) {
272  // Some shortcuts.
273  auto const N = y_response.size();
274  auto const M = x_predictors.cols();
275 
276  // Already checked in main function. Assert here again for better overview.
277  assert( x_predictors.rows() == N );
278  assert( extras.strata.empty() || extras.strata.size() == N );
279 
280  size_t xb_col = 0;
281  auto xb_tmp = std::vector<double>();
282  result.rank = 0;
283 
284  size_t ii = 0;
285  size_t ij = 0;
286  for( size_t i = 0; i < M; i++ ) {
288  x_predictors.col(i), result.weights, extras.strata,
289  extras.with_intercept, true, xb_tmp
290  );
291  result.Xb.col( xb_col ) = xb_tmp;
292 
293  double ssx = weighted_sum_of_squares( result.Xb.col( xb_col ), result.weights );
294  double ssr = ssx;
295 
296  // Regress on earlier columns
297  if( result.rank > 0 ) {
298  for( size_t j = 0; j < result.rank; ++j ) {
299  double const bij = weighted_residuals(
300  result.Xb.col( j ),
301  result.Xb.col( xb_col ),
302  result.weights,
303  xb_tmp
304  );
305  result.Xb.col( xb_col ) = xb_tmp;
306 
307  // Off-diagonal
308  result.tri[ij] = bij;
309  ++ij;
310  }
311  ssr = weighted_sum_of_squares( result.Xb.col( xb_col ), result.weights );
312  }
313 
314  // Check if we are above the singularity threshold
315  if(( ssx > 0.0) && (ssr / ssx > 1.0 - control.max_r2 )) {
316  double const bQi = weighted_residuals(
317  result.Xb.col( xb_col ),
318  result.resid,
319  result.weights,
320  result.resid
321  );
322 
323  ++result.rank;
324  ++xb_col;
325 
326  // Diagonal
327  result.tri[ij] = ssr;
328  result.which[ii] = i;
329  result.betaQ[ii] = bQi;
330  ++ij;
331  ++ii;
332  } else {
333  // We have a singluarity. Drop the element.
334  ij -= result.rank;
335  }
336  }
337 
338  for( size_t i = 0; i < N; ++i ) {
339  result.fitted[i] = y_response[i] - result.resid[i];
340  }
341 
342  double const wss = weighted_sum_of_squares( result.resid, result.weights );
343  long const dfr = freedom.degrees_of_freedom( result.rank );
344  result.scale = wss / dfr;
345  result.df_resid = dfr > 0 ? static_cast<size_t>( dfr ) : 0;
346 
347  result.converged = true;
348  result.num_iterations = 0;
349 }
350 
351 // =================================================================================================
352 // Generalized Linear Model
353 // =================================================================================================
354 
356  Matrix<double> const& x_predictors,
357  std::vector<double> const& y_response,
358  GlmFamily const& family,
359  GlmLink const& link,
360  GlmExtras const& extras,
361  GlmControl const& control
362 ) {
363 
364  // Some shortcuts.
365  auto const N = y_response.size();
366  auto const M = x_predictors.cols();
367 
368  // Error checks.
369  if( x_predictors.rows() != N ) {
370  throw std::invalid_argument( "glm_fit: size of rows of x is not size of y." );
371  }
372  if( ! extras.initial_fittings.empty() && extras.initial_fittings.size() != N ) {
373  throw std::invalid_argument( "glm_fit: size of initial fittings is not size of y." );
374  }
375  assert( extras.initial_fittings.empty() || extras.initial_fittings.size() == N );
376  if( ! extras.prior_weights.empty() && extras.prior_weights.size() != N ) {
377  throw std::invalid_argument( "glm_fit: size of prior weights is not size of y." );
378  }
379  assert( extras.prior_weights.empty() || extras.prior_weights.size() == N );
380  if( ! extras.strata.empty() && extras.strata.size() != N ) {
381  throw std::invalid_argument( "glm_fit: size of strata is not size of y." );
382  }
383  assert( extras.strata.empty() || extras.strata.size() == N );
384  if( control.epsilon <= 0.0 || control.epsilon > 1.0 ) {
385  throw std::invalid_argument( "glm_fit: epsilon has to be in ( 0.0, 1.0 ]" );
386  }
387  if( control.max_r2 <= 0.0 || control.max_r2 >= 1.0 ) {
388  throw std::invalid_argument( "glm_fit: max_r2 has to be in ( 0.0, 1.0 )" );
389  }
390  if( ! is_defined( family )) {
391  throw std::invalid_argument( "glm_fit: family is not properly defined." );
392  }
393  if( ! is_defined( link )) {
394  throw std::invalid_argument( "glm_fit: link is not properly defined." );
395  }
396 
397  // TODO interactions
398 
399  // Prepare results.
400  GlmOutput result;
401  result.Xb = Matrix<double>( N, M );
402  result.fitted = std::vector<double>( N );
403  result.resid = std::vector<double>( N );
404  result.weights = std::vector<double>( N );
405  result.which = std::vector<double>( M );
406  result.betaQ = std::vector<double>( M );
407  result.tri = std::vector<double>( (M * ( M+1 )) / 2 );
408 
409  // Is iteration necessary?
410  bool const irls = (M > 0) && !(
411  ( family.id == GlmFamily::kGaussian ) && ( link.id == GlmLink::kIdentity )
412  );
413 
414  // Initialize the fittings.
415  GlmFreedom freedom;
416  if( extras.initial_fittings.empty() || ! irls ) {
417  // Fit intercept and/or strata part of model,
418  // that is, set the fitted values to the (strata) (weighted) mean of the y values.
419  freedom = weighted_mean_centering(
420  y_response, extras.prior_weights, extras.strata, extras.with_intercept, false, result.fitted
421  );
422  } else {
423  assert( irls );
424  assert( extras.initial_fittings.size() == N );
425  result.fitted = extras.initial_fittings;
426  }
427 
428  // Prepare residuals and weights, and calculate null deviance.
429  freedom.valid_entries = 0;
430  assert( result.null_deviance == 0.0 );
431  for( size_t i = 0; i < N; i++ ) {
432  double const mu = result.fitted[i];
433  double const pi = extras.prior_weights.empty() ? 1.0 : extras.prior_weights[i];
434 
435  // Null deviance
436  auto const ud = family.unit_deviance( y_response[i], mu );
437  if( std::isfinite( ud )) {
438  result.null_deviance += ud;
439  }
440 
441  // Residuals and weights.
442  if(( ! std::isfinite(pi) ) || ( pi < 0.0 )) {
443  throw std::invalid_argument( "glm_fit: prior weights have to be non-negative." );
444  } else if( pi == 0.0 ) {
445  result.resid[i] = 0.0;
446  result.weights[i] = 0.0;
447  } else {
448  assert( std::isfinite( pi ) || ( pi > 0.0 ));
449  ++freedom.valid_entries;
450 
451  double const Vmu = family.variance( mu );
452  if( link.id == family.canonical_link_id ) {
453  switch( extras.residual_type ) {
454  case GlmExtras::kDefault: {
455  result.resid[i] = ( y_response[i] - mu ) / Vmu;
456  break;
457  }
459  result.resid[i] = ( y_response[i] - mu ) / std::sqrt( Vmu );
460  break;
461  }
463  result.resid[i] = signum( y_response[i] - mu ) * std::sqrt( ud );
464  break;
465  }
466  default: {
467  throw std::invalid_argument( "Invalid residual type specified." );
468  }
469  }
470  result.weights[i] = pi * Vmu;
471  } else {
472  double const D = link.derivative( mu );
473  result.resid[i] = D * (y_response[i] - mu);
474  result.weights[i] = pi / (D * D * Vmu);
475  }
476  }
477  }
478  if( extras.mean_deviance ) {
479  result.null_deviance /= static_cast<double>(N);
480  }
481 
482  // If X has data, include covariates
483  if( M > 0 ) {
484 
485  // IRLS algorithm, or simple linear Gaussian case
486  if( irls ) {
487  glm_irls_( x_predictors, y_response, family, link, extras, control, result );
488  } else {
489  glm_gaussian_( x_predictors, y_response, extras, control, freedom, result );
490  }
491 
492  // Calcualte deviance.
493  assert( result.deviance == 0.0 );
494  for( size_t i = 0; i < N; i++ ) {
495  auto const ud = family.unit_deviance( y_response[i], result.fitted[i] );
496  if( std::isfinite( ud )) {
497  result.deviance += ud;
498  }
499  }
500  if( extras.mean_deviance ) {
501  result.deviance /= static_cast<double>(N);
502  }
503 
504  } else {
505 
506  // No covariates
507  auto const dfr = freedom.degrees_of_freedom( 0 );
508  if( family.id == GlmFamily::kGaussian || family.id == GlmFamily::kGamma ) {
509  result.scale = weighted_sum_of_squares( result.resid, result.weights ) / (dfr);
510  } else {
511  result.scale = 1.0;
512  }
513  result.df_resid = dfr > 0 ? static_cast<size_t>( dfr ) : 0;
514  }
515 
516  return result;
517 }
518 
520  Matrix<double> const& x_predictors,
521  std::vector<double> const& y_response,
522  GlmFamily const& family,
523  GlmExtras const& extras,
524  GlmControl const& control
525 ) {
526  if( ! family.canonical_link ) {
527  throw std::runtime_error( "glm_fit: family does not provide a canonical link." );
528  }
529  return glm_fit( x_predictors, y_response, family, family.canonical_link(), extras, control );
530 }
531 
533  Matrix<double> const& x_predictors,
534  std::vector<double> const& y_response,
535  GlmExtras const& extras,
536  GlmControl const& control
537 ) {
538  auto family = glm_family_gaussian();
539  return glm_fit( x_predictors, y_response, family, family.canonical_link(), extras, control );
540 }
541 
542 // =================================================================================================
543 // Output
544 // =================================================================================================
545 
549 std::vector<double> glm_inv_tri_( std::vector<double> const& tri, size_t M )
550 {
551  if( tri.size() != (M * ( M+1 )) / 2 ) {
552  throw std::invalid_argument(
553  "glm_inv_tri_(): Input tri vector is expected to have tridiagonal form."
554  );
555  }
556 
557  auto result = tri; // std::vector<double>( M, 0 );
558  for( size_t j = 0, ij = 0; j < M; j++ ) {
559  for( size_t i = 0, ii1 = 1; i < j; i++, ii1 += (i+2) ) {
560  double w = tri[ij];
561  for( size_t k = i+1, kj = ij+1, ik = ii1; k < j; k++, kj++, ik += k ) {
562  w += result[ik]*tri[kj];
563  }
564  result[ij++] = (-w);
565  }
566  double diag = tri[ij];
567  if( diag <= 0.0 ) {
568  std::runtime_error(
569  "glm_inv_tri_(): negative diagonal with j==" + std::to_string(j) +
570  ", ij==" + std::to_string(ij) + ", diag==" + std::to_string(diag)
571  );
572  }
573  result[ij++] = 1/diag;
574  }
575  return result;
576 }
577 
582 std::vector<double> glm_estimate_betas_inv_tri_(
583  GlmOutput const& output,
584  std::vector<double> const& inv_tri
585 ) {
586  assert( inv_tri.size() == output.tri.size() );
587  if( output.betaQ.size() != output.Xb.cols() ) {
588  throw std::invalid_argument(
589  "Invalid size of betaQ for computing glm_estimate_betas()"
590  );
591  }
592 
593  auto const M = output.Xb.cols();
594  auto betas = std::vector<double>( M );
595  for( size_t i = 0, ijs = 1; i < M; i++, ijs += (2+i) ) {
596  double w = output.betaQ[i];
597  for( size_t j = i+1, ij = ijs; j < M; j++, ij += j ) {
598  w += output.betaQ[j] * inv_tri[ij];
599  }
600  betas[i] = w;
601  }
602  return betas;
603 }
604 
605 std::vector<double> glm_estimate_betas( GlmOutput const& output )
606 {
607  auto const inv_tri = glm_inv_tri_( output.tri, output.Xb.cols() );
608  return glm_estimate_betas_inv_tri_( output, inv_tri );
609 }
610 
618 std::vector<double> udu_transpose_( size_t M, std::vector<double> const& U, double scale )
619 {
620  assert( U.size() == (M * ( M+1 )) / 2 );
621 
622  // TODO the below code is wrong and accesses an element ik beyond the input array.
623  // It's a mess of indexing magic, and I understand what it is trying to do,
624  // but lack the time to dive into why it is not doing that exactly.
625  // Furthermore, with our test case, we get a symmetric, but not positive definite
626  // matrix inv_tri, which means that this decomposition here is invalid anyway.
627  // As that test is based on some very clean numbers, it probably is worse on more
628  // messy data. Hence, for now, this will not be properly implemented, until
629  // someone actually needs this.
630  assert( false );
631 
632  auto result = std::vector<double>( U.size() );
633  for( size_t j = 0, ij = 0, jj = 0; j < M; j++, jj += (1 + j) ) {
634  for( size_t i = 0; i <= j; i++, ij++ ) {
635  double w = 0.0;
636  for(
637  size_t k = j, jk = jj, ik = jj - i + j, kk = jj;
638  k < M;
639  k++, kk += (1 + k), jk += k, ik += k
640  ) {
641  double Uik = (i == k) ? 1.0 : U[ik];
642  double Ujk = (j == k) ? 1.0 : U[jk];
643  double Dk = U[kk];
644  w += Uik * Ujk * Dk;
645  }
646  result[ij] = scale * w;
647  }
648  }
649  return result;
650 }
651 
659 std::vector<double> udvdu_transpose_(
660  size_t M, std::vector<double> const& U, std::vector<double> const& V, double scale
661 ) {
662  assert( U.size() == (M * ( M+1 )) / 2 );
663 
664  auto result = std::vector<double>( U.size() );
665  for( size_t j = 0, ij = 0, jj = 0; j < M; jj += (2 + (j++)) ) {
666  for( size_t i = 0, ii = 0; i <= j; ii += (2 + (i++)), ij++ ) {
667  double w = 0.0;
668  for(
669  size_t v = j, vv = jj, jv = jj, uv = ij;
670  v < M;
671  v++, vv += (2 + v), jv += v
672  ) {
673  double Ujv = (v == j) ? 1.0 : U[jv];
674  double Dv = U[vv];
675  for( size_t u = i, uu = ii, iu = ii; u < M; u++, uu += (2 + u), iu += u ) {
676  double Uiu = (u == i) ? 1.0 : U[iu];
677  double Du = U[uu];
678  w += Du * Dv * Uiu * Ujv * V[uv];
679  if( u < v ) {
680  uv++;
681  } else {
682  uv += (1 + u);
683  }
684  }
685  uv = vv + i + 1;
686  }
687  result[ij] = scale * w;
688  }
689  }
690  return result;
691 }
692 
693 std::pair<std::vector<double>, std::vector<double>> glm_estimate_betas_and_var_covar(
694  GlmOutput const& output,
695  std::vector<double> const& meat
696 ) {
697  throw std::runtime_error( "glm_estimate_betas_and_var_covar() not implemented" );
698 
699  auto const M = output.Xb.cols();
700  auto const inv_tri = glm_inv_tri_( output.tri, M );
701  auto const betas = glm_estimate_betas_inv_tri_( output, inv_tri );
702  assert( betas.size() == M );
703 
704  std::vector<double> vars_covars;
705  if( meat.empty() ) {
706  vars_covars = udu_transpose_( M, inv_tri, output.scale );
707  } else {
708  vars_covars = udvdu_transpose_( M, inv_tri, meat, output.scale );
709  }
710  assert( vars_covars.size() == (M * ( M+1 )) / 2 );
711  return { betas, vars_covars };
712 }
713 
715  Matrix<double> const& x_predictors,
716  std::vector<double> const& y_response,
717  GlmOutput const& output,
718  std::vector<double> const& betas
719 ) {
720  return glm_estimate_intercept(
721  x_predictors, y_response, glm_link_identity(), output, betas
722  );
723 }
724 
726  Matrix<double> const& x_predictors,
727  std::vector<double> const& y_response,
728  GlmLink const& link,
729  GlmOutput const& output,
730  std::vector<double> const& betas
731 ) {
732  if( betas.size() != x_predictors.cols() ) {
733  throw std::invalid_argument(
734  "Invalid size of betas for computing glm_estimate_intercept()"
735  );
736  }
737 
738  // TODO Do we need to account for strata here as well?
739  // As far as I could figure this out, the estimated betaQ values are computed across strata
740  // andyway, so then the intercept would be as well? Meaning that we do not need to take
741  // strata into account here again. But not sure.
742 
743  // We compute the weighted averages of the y_response and each of the x_predictors,
744  // using the weights as determined by glm_fit(). Then, we compute the sum of the products
745  // of the betas with the weighted averages of the predictors. That, subtracted from the
746  // response, is our intercept.
747 
748  // First we get the sum of the weights themselves.
749  // We just "misuse" the weighted sum here to sum the weights themselves.
750  auto const weight_sum = weighted_sum( output.weights );
751  assert( std::isfinite( weight_sum ));
752 
753  // We first need to translate our response into the link space.
754  auto y_response_transformed = y_response;
755  for( auto& y : y_response_transformed ) {
756  y = link.link(y);
757  }
758 
759  // Now compute the weighted sum of the y_response_transformed, and divide by the weight sum,
760  // i.e., compute the weighted average of the response.
761  auto const y_avg = weighted_sum( y_response_transformed, output.weights ) / weight_sum;
762  assert( std::isfinite( y_avg ));
763 
764  // Compute our final result by subtracting product of the beta values with the sum of the
765  // weighted average of each column of x_predictors from the y_response_transformed average.
766  double result = y_avg;
767  for( size_t i = 0; i < x_predictors.cols(); ++i ) {
768  auto const x_col_avg = weighted_sum( x_predictors.col(i), output.weights ) / weight_sum;
769  assert( std::isfinite( x_col_avg ));
770  result -= betas[i] * x_col_avg;
771  }
772  return result;
773 }
774 
775 std::vector<double> glm_coefficients(
776  Matrix<double> const& x_predictors,
777  std::vector<double> const& y_response,
778  GlmOutput const& output
779 ) {
780  return glm_coefficients( x_predictors, y_response, glm_link_identity(), output );
781 }
782 
783 std::vector<double> glm_coefficients(
784  Matrix<double> const& x_predictors,
785  std::vector<double> const& y_response,
786  GlmLink const& link,
787  GlmOutput const& output
788 ) {
789  auto coeffs = glm_estimate_betas( output );
790  auto intercept = glm_estimate_intercept( x_predictors, y_response, link, output, coeffs );
791  coeffs.insert( coeffs.begin(), intercept );
792  return coeffs;
793 }
794 
795 } // namespace utils
796 } // namespace genesis
genesis::utils::GlmExtras::mean_deviance
bool mean_deviance
Calculate mean null_deviance and mean deviance instead of their sums.
Definition: glm.hpp:76
genesis::utils::glm_gaussian_
static void glm_gaussian_(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmExtras const &extras, GlmControl const &control, GlmFreedom const &freedom, GlmOutput &result)
Definition: glm.cpp:264
genesis::utils::GlmExtras::kPearsonResiduals
@ kPearsonResiduals
Definition: glm.hpp:62
genesis::utils::glm_coefficients
std::vector< double > glm_coefficients(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmOutput const &output)
Compute the model coefficients of a glm_fit().
Definition: glm.cpp:775
genesis::utils::GlmOutput::rank
size_t rank
Rank of X after regression on strata.
Definition: glm.hpp:108
genesis::utils::GlmFamily::kGaussian
@ kGaussian
Definition: family.hpp:57
genesis::utils::GlmFamily::canonical_link_id
GlmLink::Link canonical_link_id
Internal ID of the GlmLink, used to check if the link is the canonical one for a given distribution f...
Definition: family.hpp:70
genesis::utils::Matrix::cols
size_t cols() const
Definition: containers/matrix.hpp:181
genesis::utils::GlmOutput
Definition: glm.hpp:100
genesis::utils::udvdu_transpose_
std::vector< double > udvdu_transpose_(size_t M, std::vector< double > const &U, std::vector< double > const &V, double scale)
Calculate U.D.V.D.U-transpose.
Definition: glm.cpp:659
genesis::utils::GlmOutput::scale
double scale
Scale factor (scalar).
Definition: glm.hpp:118
glm.hpp
genesis::utils::GlmFamily
Definition: family.hpp:48
genesis::utils::is_defined
bool is_defined(GlmFamily const &family)
Check whether all necessary values and functors of a GlmFamily are set.
Definition: family.hpp:101
genesis::utils::signum
constexpr int signum(T x, std::false_type)
Implementation of signum(T x) for unsigned types. See there for details.
Definition: common.hpp:104
common.hpp
genesis::utils::GlmFamily::kGamma
@ kGamma
Definition: family.hpp:58
genesis::utils::glm_link_identity
GlmLink glm_link_identity()
Identity link functions.
Definition: utils/math/regression/link.hpp:153
genesis::utils::GlmOutput::num_iterations
size_t num_iterations
Definition: glm.hpp:103
genesis::utils::GlmControl::max_iterations
size_t max_iterations
Maximum number of iterations to run the IRLS algorithm for (if needed).
Definition: glm.hpp:84
genesis::utils::GlmExtras
Definition: glm.hpp:48
genesis::utils::weighted_mean_centering
GlmFreedom weighted_mean_centering(std::vector< double > const &y_input, std::vector< double > const &weights, std::vector< size_t > const &strata, bool with_intercept, bool centering, std::vector< double > &y_output)
(Weighted) mean and centering.
Definition: utils/math/regression/helper.cpp:71
genesis::utils::glm_fit
GlmOutput glm_fit(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmFamily const &family, GlmLink const &link, GlmExtras const &extras, GlmControl const &control)
Fit a Generalized Linear Model (GLM).
Definition: glm.cpp:355
genesis::utils::weighted_residuals
double weighted_residuals(std::vector< double > const &x_input, std::vector< double > const &y_input, std::vector< double > const &weights, std::vector< double > &y_output)
Calculate the residuals from (weighted) regression through the origin.
Definition: utils/math/regression/helper.cpp:221
genesis::utils::GlmOutput::Xb
Matrix< double > Xb
Orthogonal basis for X space (N * M matrix, with N * rank being used).
Definition: glm.hpp:123
genesis::utils::GlmControl
Definition: glm.hpp:79
genesis::utils::GlmFamily::canonical_link
std::function< GlmLink()> canonical_link
Get the canonical link function.
Definition: family.hpp:95
genesis::utils::scale
void scale(Histogram &h, double factor)
Definition: operations.cpp:54
genesis::utils::GlmFreedom
Internal helper structure for GLMs to calcualte the residual degrees of freedom.
Definition: utils/math/regression/helper.hpp:47
genesis::utils::Matrix::col
MatrixCol< self_type, value_type > col(size_t col)
Definition: containers/matrix.hpp:271
genesis::utils::Matrix< double >
genesis::population::to_string
std::string to_string(GenomeLocus const &locus)
Definition: function/genome_locus.hpp:52
helper.hpp
genesis::utils::glm_estimate_intercept
double glm_estimate_intercept(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmOutput const &output, std::vector< double > const &betas)
Compute the intercept resulting from a glm_fit().
Definition: glm.cpp:714
genesis::utils::GlmOutput::null_deviance
double null_deviance
Null deviance.
Definition: glm.hpp:165
genesis::utils::GlmOutput::tri
std::vector< double > tri
Upper unit triangular transformation matrix, with Xb - tr.Xb placed in the diagonal (size (M * (M+1))...
Definition: glm.hpp:156
logging.hpp
Provides easy and fast logging functionality.
genesis::utils::glm_irls_
static void glm_irls_(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmFamily const &family, GlmLink const &link, GlmExtras const &extras, GlmControl const &control, GlmOutput &result)
Definition: glm.cpp:81
genesis::utils::GlmExtras::residual_type
ResidualType residual_type
Definition: glm.hpp:66
genesis::utils::GlmExtras::strata
std::vector< size_t > strata
Strata assignments coded 1...S.
Definition: glm.hpp:57
genesis::utils::GlmOutput::weights
std::vector< double > weights
Weights (size N)
Definition: glm.hpp:138
genesis::utils::GlmOutput::deviance
double deviance
Deviance.
Definition: glm.hpp:174
genesis::utils::GlmControl::max_r2
double max_r2
Threshold for singluarities. Internally used as eta = 1.0 - max_r2.
Definition: glm.hpp:97
genesis::utils::GlmOutput::converged
bool converged
Definition: glm.hpp:102
genesis::utils::GlmExtras::initial_fittings
std::vector< double > initial_fittings
Definition: glm.hpp:50
genesis
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
Definition: placement/formats/edge_color.cpp:42
genesis::utils::GlmControl::epsilon
double epsilon
Proportional change in weighted sum of squares residuals to declare convergence between two iteration...
Definition: glm.hpp:90
genesis::utils::GlmOutput::df_resid
size_t df_resid
Residual degrees of freedom.
Definition: glm.hpp:113
genesis::utils::glm_estimate_betas_inv_tri_
std::vector< double > glm_estimate_betas_inv_tri_(GlmOutput const &output, std::vector< double > const &inv_tri)
Helper function to compute the betas, given that we have already inverted the tri matrix.
Definition: glm.cpp:582
genesis::utils::glm_estimate_betas
std::vector< double > glm_estimate_betas(GlmOutput const &output)
Compute the beta estimates resulting from a glm_fit().
Definition: glm.cpp:605
genesis::utils::glm_family_gaussian
GlmFamily glm_family_gaussian()
Gaussian/normal family functions.
Definition: family.hpp:226
family.hpp
genesis::utils::GlmFamily::rectify
std::function< double(double mu)> rectify
Rectify to a valid value, for the fitted mean, to avoid extreme predictions.
Definition: family.hpp:85
genesis::utils::GlmFreedom::valid_entries
size_t valid_entries
Number of valid priors (Nu).
Definition: utils/math/regression/helper.hpp:52
genesis::utils::GlmFamily::log_likelihood
std::function< double(double y, double mu)> log_likelihood
Log-Likelihood contribution of a value. To be multiplied by prior weight.
Definition: family.hpp:80
genesis::utils::GlmOutput::which
std::vector< double > which
Which columns in the X matrix were estimated (first = 0) (size M).
Definition: glm.hpp:143
genesis::utils::GlmExtras::kDefault
@ kDefault
Definition: glm.hpp:61
genesis::utils::GlmFamily::variance
std::function< double(double mu)> variance
Variance function for the distribution family.
Definition: family.hpp:75
genesis::utils::GlmOutput::fitted
std::vector< double > fitted
Fitted values (size N).
Definition: glm.hpp:128
genesis::utils::GlmExtras::with_intercept
bool with_intercept
Definition: glm.hpp:52
genesis::utils::GlmFreedom::degrees_of_freedom
long degrees_of_freedom(size_t rank) const
Calculate the degrees of freedom (dfr).
Definition: utils/math/regression/helper.hpp:67
genesis::utils::weighted_sum_of_squares
double weighted_sum_of_squares(std::vector< double > const &x_input, std::vector< double > const &weights)
(Weighted) sum of squares.
Definition: utils/math/regression/helper.cpp:290
genesis::utils::udu_transpose_
std::vector< double > udu_transpose_(size_t M, std::vector< double > const &U, double scale)
Calculate U.D.U-transpose.
Definition: glm.cpp:618
genesis::utils::GlmExtras::prior_weights
std::vector< double > prior_weights
Definition: glm.hpp:51
genesis::utils::Matrix::rows
size_t rows() const
Definition: containers/matrix.hpp:176
genesis::utils::glm_estimate_betas_and_var_covar
std::pair< std::vector< double >, std::vector< double > > glm_estimate_betas_and_var_covar(GlmOutput const &output, std::vector< double > const &meat)
Definition: glm.cpp:693
genesis::utils::glm_inv_tri_
std::vector< double > glm_inv_tri_(std::vector< double > const &tri, size_t M)
Invert diagonal and unit upper triangular matrices stored as one array.
Definition: glm.cpp:549
genesis::utils::weighted_sum
double weighted_sum(std::vector< double > const &x_input, std::vector< double > const &weights)
(Weighted) sum of a vector of values.
Definition: utils/math/regression/helper.cpp:364
genesis::utils::GlmOutput::betaQ
std::vector< double > betaQ
Vector of parameter estimates (in terms of basis matrix, Xb) (size M).
Definition: glm.hpp:150
genesis::utils::GlmFamily::id
Family id
Internal ID of the GlmFamily, used to check for specific families where needed.
Definition: family.hpp:64
genesis::utils::GlmFamily::unit_deviance
std::function< double(double y, double mu)> unit_deviance
Unit deviance for the distribution family.
Definition: family.hpp:90
genesis::utils::GlmExtras::kDevianceResiduals
@ kDevianceResiduals
Definition: glm.hpp:63
genesis::utils::GlmOutput::resid
std::vector< double > resid
Working residuals (on linear predictor scale) (size N).
Definition: glm.hpp:133