83 std::vector<double>
const& y_response,
91 auto const N = y_response.size();
92 auto const M = x_predictors.
cols();
95 assert( x_predictors.
rows() == N );
97 assert( extras.
strata.empty() || extras.
strata.size() == N );
100 auto y_working = std::vector<double>( N );
107 double logL_prev = 0.0;
109 for(
size_t i = 0; i < N; i++ ) {
122 auto xb_tmp = std::vector<double>();
127 for(
size_t i = 0; i < M; ++i ) {
133 result.
Xb.
col( xb_col ) = xb_tmp;
142 if( result.
rank > 0 ) {
143 for(
size_t j = 0; j < result.
rank; ++j ) {
148 result.
Xb.
col( xb_col ),
152 result.
Xb.
col( xb_col ) = xb_tmp;
155 result.
tri[ij] = bij;
162 if ((ssx > 0.0) && (ssr / ssx > 1.0 - control.
max_r2)) {
164 result.
Xb.
col( xb_col ),
172 assert(( xb_col < M ) xor ( i+1 == M ));
175 result.
tri[ij] = ssr;
176 result.
which[ii] = i;
177 result.
betaQ[ii] = bQi;
188 freedom.valid_entries = 0;
190 for(
size_t i = 0; i < N; ++i ) {
199 if( !( pi != 0.0 && ( result.
weights[i] > 0.0 ))) {
203 double const Vmu = family.
variance( mu );
204 ++freedom.valid_entries;
211 ri = ( y_response[i] - mu ) / Vmu;
215 ri = ( y_response[i] - mu ) / std::sqrt( Vmu );
220 ri =
signum( y_response[i] - mu ) * std::sqrt( ud );
224 throw std::invalid_argument(
"Invalid residual type specified." );
229 wi = pi / (D * D * Vmu);
230 ri = D * (y_response[i] - mu);
235 result.
resid[i] = ri;
238 long const dfr = freedom.degrees_of_freedom( result.
rank );
239 result.
df_resid = dfr > 0 ?
static_cast<size_t>( dfr ) : 0;
241 result.
scale = wss / (dfr);
246 double dL = (logL - logL_prev) / (result.
scale);
255 for(
size_t i = 0; i < N; i++ ) {
266 std::vector<double>
const& y_response,
273 auto const N = y_response.size();
274 auto const M = x_predictors.
cols();
277 assert( x_predictors.
rows() == N );
278 assert( extras.
strata.empty() || extras.
strata.size() == N );
281 auto xb_tmp = std::vector<double>();
286 for(
size_t i = 0; i < M; i++ ) {
291 result.
Xb.
col( xb_col ) = xb_tmp;
297 if( result.
rank > 0 ) {
298 for(
size_t j = 0; j < result.
rank; ++j ) {
301 result.
Xb.
col( xb_col ),
305 result.
Xb.
col( xb_col ) = xb_tmp;
308 result.
tri[ij] = bij;
315 if(( ssx > 0.0) && (ssr / ssx > 1.0 - control.
max_r2 )) {
317 result.
Xb.
col( xb_col ),
327 result.
tri[ij] = ssr;
328 result.
which[ii] = i;
329 result.
betaQ[ii] = bQi;
338 for(
size_t i = 0; i < N; ++i ) {
339 result.
fitted[i] = y_response[i] - result.
resid[i];
344 result.
scale = wss / dfr;
345 result.
df_resid = dfr > 0 ?
static_cast<size_t>( dfr ) : 0;
357 std::vector<double>
const& y_response,
365 auto const N = y_response.size();
366 auto const M = x_predictors.
cols();
369 if( x_predictors.
rows() != N ) {
370 throw std::invalid_argument(
"glm_fit: size of rows of x is not size of y." );
373 throw std::invalid_argument(
"glm_fit: size of initial fittings is not size of y." );
377 throw std::invalid_argument(
"glm_fit: size of prior weights is not size of y." );
380 if( ! extras.
strata.empty() && extras.
strata.size() != N ) {
381 throw std::invalid_argument(
"glm_fit: size of strata is not size of y." );
383 assert( extras.
strata.empty() || extras.
strata.size() == N );
385 throw std::invalid_argument(
"glm_fit: epsilon has to be in ( 0.0, 1.0 ]" );
388 throw std::invalid_argument(
"glm_fit: max_r2 has to be in ( 0.0, 1.0 )" );
391 throw std::invalid_argument(
"glm_fit: family is not properly defined." );
394 throw std::invalid_argument(
"glm_fit: link is not properly defined." );
402 result.
fitted = std::vector<double>( N );
403 result.
resid = std::vector<double>( N );
404 result.
weights = std::vector<double>( N );
405 result.
which = std::vector<double>( M );
406 result.
betaQ = std::vector<double>( M );
407 result.
tri = std::vector<double>( (M * ( M+1 )) / 2 );
410 bool const irls = (M > 0) && !(
431 for(
size_t i = 0; i < N; i++ ) {
432 double const mu = result.
fitted[i];
437 if( std::isfinite( ud )) {
442 if(( ! std::isfinite(pi) ) || ( pi < 0.0 )) {
443 throw std::invalid_argument(
"glm_fit: prior weights have to be non-negative." );
444 }
else if( pi == 0.0 ) {
445 result.
resid[i] = 0.0;
448 assert( std::isfinite( pi ) || ( pi > 0.0 ));
451 double const Vmu = family.
variance( mu );
455 result.
resid[i] = ( y_response[i] - mu ) / Vmu;
459 result.
resid[i] = ( y_response[i] - mu ) / std::sqrt( Vmu );
463 result.
resid[i] =
signum( y_response[i] - mu ) * std::sqrt( ud );
467 throw std::invalid_argument(
"Invalid residual type specified." );
473 result.
resid[i] = D * (y_response[i] - mu);
474 result.
weights[i] = pi / (D * D * Vmu);
487 glm_irls_( x_predictors, y_response, family, link, extras, control, result );
489 glm_gaussian_( x_predictors, y_response, extras, control, freedom, result );
494 for(
size_t i = 0; i < N; i++ ) {
496 if( std::isfinite( ud )) {
501 result.
deviance /=
static_cast<double>(N);
513 result.
df_resid = dfr > 0 ?
static_cast<size_t>( dfr ) : 0;
521 std::vector<double>
const& y_response,
527 throw std::runtime_error(
"glm_fit: family does not provide a canonical link." );
534 std::vector<double>
const& y_response,
539 return glm_fit( x_predictors, y_response, family, family.canonical_link(), extras, control );
std::vector< double > which
Which columns in the X matrix were estimated (first = 0) (size M).
double weighted_sum_of_squares(std::vector< double > const &x_input, std::vector< double > const &weights)
(Weighted) sum of squares.
double weighted_residuals(std::vector< double > const &x_input, std::vector< double > const &y_input, std::vector< double > const &weights, std::vector< double > &y_output)
Calculate the residuals from (weighted) regression through the origin.
std::function< double(double y, double mu)> unit_deviance
Unit deviance for the distribution family.
std::vector< double > tri
Upper unit triangular transformation matrix, with Xb - tr.Xb placed in the diagonal (size (M * (M+1))...
Internal helper structure for GLMs to calcualte the residual degrees of freedom.
GlmFamily glm_family_gaussian()
Gaussian/normal family functions.
size_t valid_entries
Number of valid priors (Nu).
constexpr int signum(T x, std::false_type)
Implementation of signum(T x) for unsigned types. See there for details.
Container namespace for all symbols of genesis in order to keep them separate when used as a library...
bool is_defined(GlmFamily const &family)
Check whether all necessary values and functors of a GlmFamily are set.
size_t max_iterations
Maximum number of iterations to run the IRLS algorithm for (if needed).
double scale
Scale factor (scalar).
std::vector< double > weights
Weights (size N)
GlmFreedom weighted_mean_centering(std::vector< double > const &y_input, std::vector< double > const &weights, std::vector< size_t > const &strata, bool with_intercept, bool centering, std::vector< double > &y_output)
(Weighted) mean and centering.
Matrix< double > Xb
Orthogonal basis for X space (N * M matrix, with N * rank being used).
MatrixCol< self_type, value_type > col(size_t col)
std::function< double(double mu)> variance
Variance function for the distribution family.
std::function< GlmLink()> canonical_link
Get the canonical link function.
static void glm_gaussian_(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmExtras const &extras, GlmControl const &control, GlmFreedom const &freedom, GlmOutput &result)
std::function< double(double mu)> derivative
Derivative of the link function.
static void glm_irls_(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmFamily const &family, GlmLink const &link, GlmExtras const &extras, GlmControl const &control, GlmOutput &result)
std::vector< double > resid
Working residuals (on linear predictor scale) (size N).
double max_r2
Threshold for singluarities. Internally used as eta = 1.0 - max_r2.
std::function< double(double mu)> rectify
Rectify to a valid value, for the fitted mean, to avoid extreme predictions.
Provides easy and fast logging functionality.
double epsilon
Proportional change in weighted sum of squares residuals to declare convergence between two iteration...
GlmOutput glm_fit(Matrix< double > const &x_predictors, std::vector< double > const &y_response, GlmFamily const &family, GlmLink const &link, GlmExtras const &extras, GlmControl const &control)
Fit a Generalized Linear Model (GLM).
std::function< double(double mu)> link
Link function.
long degrees_of_freedom(size_t rank) const
Calculate the degrees of freedom (dfr).
GlmLink::Link canonical_link_id
Internal ID of the GlmLink, used to check if the link is the canonical one for a given distribution f...
std::function< double(double y, double mu)> log_likelihood
Log-Likelihood contribution of a value. To be multiplied by prior weight.
std::vector< double > betaQ
Vector of parameter estimates (in terms of basis matrix, Xb) (size M).
size_t rank
Rank of X after regression on strata.
Link id
Internal ID, used to check if the link is the canonical one for a distribution family.
Family id
Internal ID of the GlmFamily, used to check for specific families where needed.
size_t df_resid
Residual degrees of freedom.
std::function< double(double eta)> inverse_link
Inverse of the link function.
std::vector< double > fitted
Fitted values (size N).