|
A library for working with phylogenetic and population genetic data.
v0.27.0
|
|
Go to the documentation of this file.
83 std::vector<double>
const& y_response,
91 auto const N = y_response.size();
92 auto const M = x_predictors.
cols();
95 assert( x_predictors.
rows() == N );
97 assert( extras.
strata.empty() || extras.
strata.size() == N );
100 auto y_working = std::vector<double>( N );
107 double logL_prev = 0.0;
109 for(
size_t i = 0; i < N; i++ ) {
122 auto xb_tmp = std::vector<double>();
127 for(
size_t i = 0; i < M; ++i ) {
133 result.
Xb.
col( xb_col ) = xb_tmp;
142 if( result.
rank > 0 ) {
143 for(
size_t j = 0; j < result.
rank; ++j ) {
148 result.
Xb.
col( xb_col ),
152 result.
Xb.
col( xb_col ) = xb_tmp;
155 result.
tri[ij] = bij;
162 if ((ssx > 0.0) && (ssr / ssx > 1.0 - control.
max_r2)) {
164 result.
Xb.
col( xb_col ),
172 assert(( xb_col < M ) xor ( i+1 == M ));
175 result.
tri[ij] = ssr;
176 result.
which[ii] = i;
177 result.
betaQ[ii] = bQi;
188 freedom.valid_entries = 0;
190 for(
size_t i = 0; i < N; ++i ) {
199 if( !( pi != 0.0 && ( result.
weights[i] > 0.0 ))) {
203 double const Vmu = family.
variance( mu );
204 ++freedom.valid_entries;
211 ri = ( y_response[i] - mu ) / Vmu;
215 ri = ( y_response[i] - mu ) / std::sqrt( Vmu );
220 ri =
signum( y_response[i] - mu ) * std::sqrt( ud );
224 throw std::invalid_argument(
"Invalid residual type specified." );
229 wi = pi / (D * D * Vmu);
230 ri = D * (y_response[i] - mu);
235 result.
resid[i] = ri;
238 long const dfr = freedom.degrees_of_freedom( result.
rank );
239 result.
df_resid = dfr > 0 ?
static_cast<size_t>( dfr ) : 0;
241 result.
scale = wss / (dfr);
246 double dL = (logL - logL_prev) / (result.
scale);
255 for(
size_t i = 0; i < N; i++ ) {
266 std::vector<double>
const& y_response,
273 auto const N = y_response.size();
274 auto const M = x_predictors.
cols();
277 assert( x_predictors.
rows() == N );
278 assert( extras.
strata.empty() || extras.
strata.size() == N );
281 auto xb_tmp = std::vector<double>();
286 for(
size_t i = 0; i < M; i++ ) {
291 result.
Xb.
col( xb_col ) = xb_tmp;
297 if( result.
rank > 0 ) {
298 for(
size_t j = 0; j < result.
rank; ++j ) {
301 result.
Xb.
col( xb_col ),
305 result.
Xb.
col( xb_col ) = xb_tmp;
308 result.
tri[ij] = bij;
315 if(( ssx > 0.0) && (ssr / ssx > 1.0 - control.
max_r2 )) {
317 result.
Xb.
col( xb_col ),
327 result.
tri[ij] = ssr;
328 result.
which[ii] = i;
329 result.
betaQ[ii] = bQi;
338 for(
size_t i = 0; i < N; ++i ) {
339 result.
fitted[i] = y_response[i] - result.
resid[i];
344 result.
scale = wss / dfr;
345 result.
df_resid = dfr > 0 ?
static_cast<size_t>( dfr ) : 0;
357 std::vector<double>
const& y_response,
365 auto const N = y_response.size();
366 auto const M = x_predictors.
cols();
369 if( x_predictors.
rows() != N ) {
370 throw std::invalid_argument(
"glm_fit: size of rows of x is not size of y." );
373 throw std::invalid_argument(
"glm_fit: size of initial fittings is not size of y." );
377 throw std::invalid_argument(
"glm_fit: size of prior weights is not size of y." );
380 if( ! extras.
strata.empty() && extras.
strata.size() != N ) {
381 throw std::invalid_argument(
"glm_fit: size of strata is not size of y." );
383 assert( extras.
strata.empty() || extras.
strata.size() == N );
385 throw std::invalid_argument(
"glm_fit: epsilon has to be in ( 0.0, 1.0 ]" );
388 throw std::invalid_argument(
"glm_fit: max_r2 has to be in ( 0.0, 1.0 )" );
391 throw std::invalid_argument(
"glm_fit: family is not properly defined." );
394 throw std::invalid_argument(
"glm_fit: link is not properly defined." );
402 result.
fitted = std::vector<double>( N );
403 result.
resid = std::vector<double>( N );
404 result.
weights = std::vector<double>( N );
405 result.
which = std::vector<double>( M );
406 result.
betaQ = std::vector<double>( M );
407 result.
tri = std::vector<double>( (M * ( M+1 )) / 2 );
410 bool const irls = (M > 0) && !(
431 for(
size_t i = 0; i < N; i++ ) {
432 double const mu = result.
fitted[i];
437 if( std::isfinite( ud )) {
442 if(( ! std::isfinite(pi) ) || ( pi < 0.0 )) {
443 throw std::invalid_argument(
"glm_fit: prior weights have to be non-negative." );
444 }
else if( pi == 0.0 ) {
445 result.
resid[i] = 0.0;
448 assert( std::isfinite( pi ) || ( pi > 0.0 ));
451 double const Vmu = family.
variance( mu );
455 result.
resid[i] = ( y_response[i] - mu ) / Vmu;
459 result.
resid[i] = ( y_response[i] - mu ) / std::sqrt( Vmu );
463 result.
resid[i] =
signum( y_response[i] - mu ) * std::sqrt( ud );
467 throw std::invalid_argument(
"Invalid residual type specified." );
473 result.
resid[i] = D * (y_response[i] - mu);
474 result.
weights[i] = pi / (D * D * Vmu);
487 glm_irls_( x_predictors, y_response, family, link, extras, control, result );
489 glm_gaussian_( x_predictors, y_response, extras, control, freedom, result );
494 for(
size_t i = 0; i < N; i++ ) {
496 if( std::isfinite( ud )) {
501 result.
deviance /=
static_cast<double>(N);
513 result.
df_resid = dfr > 0 ?
static_cast<size_t>( dfr ) : 0;
521 std::vector<double>
const& y_response,
527 throw std::runtime_error(
"glm_fit: family does not provide a canonical link." );
534 std::vector<double>
const& y_response,
539 return glm_fit( x_predictors, y_response, family, family.canonical_link(), extras, control );
static void glm_gaussian_(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmExtras const &extras, GlmControl const &control, GlmFreedom const &freedom, GlmOutput &result)
std::function< double(double mu)> link
Link function.
size_t rank
Rank of X after regression on strata.
GlmLink::Link canonical_link_id
Internal ID of the GlmLink, used to check if the link is the canonical one for a given distribution f...
double scale
Scale factor (scalar).
bool is_defined(GlmFamily const &family)
Check whether all necessary values and functors of a GlmFamily are set.
constexpr int signum(T x, std::false_type)
Implementation of signum(T x) for unsigned types. See there for details.
std::function< double(double mu)> derivative
Derivative of the link function.
size_t max_iterations
Maximum number of iterations to run the IRLS algorithm for (if needed).
GlmFreedom weighted_mean_centering(std::vector< double > const &y_input, std::vector< double > const &weights, std::vector< size_t > const &strata, bool with_intercept, bool centering, std::vector< double > &y_output)
(Weighted) mean and centering.
GlmOutput glm_fit(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmFamily const &family, GlmLink const &link, GlmExtras const &extras, GlmControl const &control)
Fit a Generalized Linear Model (GLM).
double weighted_residuals(std::vector< double > const &x_input, std::vector< double > const &y_input, std::vector< double > const &weights, std::vector< double > &y_output)
Calculate the residuals from (weighted) regression through the origin.
Matrix< double > Xb
Orthogonal basis for X space (N * M matrix, with N * rank being used).
std::function< GlmLink()> canonical_link
Get the canonical link function.
Internal helper structure for GLMs to calcualte the residual degrees of freedom.
MatrixCol< self_type, value_type > col(size_t col)
std::vector< double > tri
Upper unit triangular transformation matrix, with Xb - tr.Xb placed in the diagonal (size (M * (M+1))...
Provides easy and fast logging functionality.
static void glm_irls_(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmFamily const &family, GlmLink const &link, GlmExtras const &extras, GlmControl const &control, GlmOutput &result)
std::vector< double > weights
Weights (size N)
double max_r2
Threshold for singluarities. Internally used as eta = 1.0 - max_r2.
Container namespace for all symbols of genesis in order to keep them separate when used as a library.
double epsilon
Proportional change in weighted sum of squares residuals to declare convergence between two iteration...
size_t df_resid
Residual degrees of freedom.
GlmFamily glm_family_gaussian()
Gaussian/normal family functions.
std::function< double(double mu)> rectify
Rectify to a valid value, for the fitted mean, to avoid extreme predictions.
Link id
Internal ID, used to check if the link is the canonical one for a distribution family.
size_t valid_entries
Number of valid priors (Nu).
std::function< double(double y, double mu)> log_likelihood
Log-Likelihood contribution of a value. To be multiplied by prior weight.
std::vector< double > which
Which columns in the X matrix were estimated (first = 0) (size M).
std::function< double(double eta)> inverse_link
Inverse of the link function.
std::function< double(double mu)> variance
Variance function for the distribution family.
std::vector< double > fitted
Fitted values (size N).
long degrees_of_freedom(size_t rank) const
Calculate the degrees of freedom (dfr).
double weighted_sum_of_squares(std::vector< double > const &x_input, std::vector< double > const &weights)
(Weighted) sum of squares.
std::vector< double > betaQ
Vector of parameter estimates (in terms of basis matrix, Xb) (size M).
Family id
Internal ID of the GlmFamily, used to check for specific families where needed.
std::function< double(double y, double mu)> unit_deviance
Unit deviance for the distribution family.
std::vector< double > resid
Working residuals (on linear predictor scale) (size N).